Data Analysis for Hacker News Posts

In this notebook we will analyse the data to determine two things:

Do Ask HN or Show HN receive more comments on average?
Do posts created at a certain time receive more comments on average?

import csv 

# Read the whole CSV into a list of rows (each row is a list of strings).
# The `with` block closes the file as soon as reading finishes; previously the
# handle was opened and never closed. newline="" is how the csv module asks
# for its input files to be opened, so that line endings inside quoted fields
# reach the parser untouched.
with open("hacker_news.csv", newline="") as file:
    hn = list(csv.reader(file))

# Preview the first five rows; at this point the header row is still included.
print(hn[:5])

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]

# Split the header row off from the data rows, then show the header and the
# first 5 data rows.
headers, hn = hn[0], hn[1:]
print(headers)
print(hn[:5])

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]

# Bucket every post by the prefix of its title (column 1), compared
# case-insensitively: "ask hn", "show hn", or anything else.
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

# Report how many posts landed in each bucket (ask, show, other).
for bucket in (ask_posts, show_posts, other_posts):
    print(len(bucket))

1744
1162
17194

Let's identify whether Ask HN or Show HN posts receive more comments on average

# Mean number of comments per `Ask HN` post (column 4 holds num_comments).
total_ask_comments = sum(int(post[4]) for post in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print(avg_ask_comments)

14.038417431192661

# Mean number of comments per `Show HN` post (column 4 holds num_comments).
total_show_comments = sum(int(post[4]) for post in show_posts)
avg_show_comments = total_show_comments / len(show_posts)
print(avg_show_comments)

10.31669535283993

Finding the Number of Ask Posts and Comments by Hour Created

import datetime as dt

# Pair each Ask HN post's creation timestamp (column 6) with its comment
# count (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

comments_by_hour = {}
counts_by_hour = {}
date_format = "%m/%d/%Y %H:%M"

# For every post, derive the zero-padded hour string of its creation time and
# accumulate both the total comments and the number of posts for that hour.
for created_at, num_comments in result_list:
    parsed = dt.datetime.strptime(created_at, date_format)
    hour = parsed.strftime("%H")
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + num_comments
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1

comments_by_hour

{'00': 447,
 '01': 683,
 '02': 1381,
 '03': 421,
 '04': 337,
 '05': 464,
 '06': 397,
 '07': 267,
 '08': 492,
 '09': 251,
 '10': 793,
 '11': 641,
 '12': 687,
 '13': 1253,
 '14': 1416,
 '15': 4477,
 '16': 1814,
 '17': 1146,
 '18': 1439,
 '19': 1188,
 '20': 1722,
 '21': 1745,
 '22': 479,
 '23': 543}

# Average number of comments per `Ask HN` post for each creation hour, stored
# as [hour, total comments / number of posts] pairs.
avg_by_hour = [
    [hour, total / counts_by_hour[hour]]
    for hour, total in comments_by_hour.items()
]

avg_by_hour

[['11', 11.051724137931034],
 ['03', 7.796296296296297],
 ['00', 8.127272727272727],
 ['18', 13.20183486238532],
 ['17', 11.46],
 ['10', 13.440677966101696],
 ['04', 7.170212765957447],
 ['16', 16.796296296296298],
 ['14', 13.233644859813085],
 ['08', 10.25],
 ['19', 10.8],
 ['15', 38.5948275862069],
 ['09', 5.5777777777777775],
 ['07', 7.852941176470588],
 ['02', 23.810344827586206],
 ['13', 14.741176470588234],
 ['20', 21.525],
 ['23', 7.985294117647059],
 ['21', 16.009174311926607],
 ['22', 6.746478873239437],
 ['06', 9.022727272727273],
 ['01', 11.383333333333333],
 ['12', 9.41095890410959],
 ['05', 10.08695652173913]]

# Flip each [hour, average] pair to [average, hour] so that an ordinary list
# sort orders the entries by average first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

print(swap_avg_by_hour)

# Sort a copy in descending order, leaving the unsorted list intact.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

sorted_swap

[[11.051724137931034, '11'], [7.796296296296297, '03'], [8.127272727272727, '00'], [13.20183486238532, '18'], [11.46, '17'], [13.440677966101696, '10'], [7.170212765957447, '04'], [16.796296296296298, '16'], [13.233644859813085, '14'], [10.25, '08'], [10.8, '19'], [38.5948275862069, '15'], [5.5777777777777775, '09'], [7.852941176470588, '07'], [23.810344827586206, '02'], [14.741176470588234, '13'], [21.525, '20'], [7.985294117647059, '23'], [16.009174311926607, '21'], [6.746478873239437, '22'], [9.022727272727273, '06'], [11.383333333333333, '01'], [9.41095890410959, '12'], [10.08695652173913, '05']]

[[38.5948275862069, '15'],
 [23.810344827586206, '02'],
 [21.525, '20'],
 [16.796296296296298, '16'],
 [16.009174311926607, '21'],
 [14.741176470588234, '13'],
 [13.440677966101696, '10'],
 [13.233644859813085, '14'],
 [13.20183486238532, '18'],
 [11.46, '17'],
 [11.383333333333333, '01'],
 [11.051724137931034, '11'],
 [10.8, '19'],
 [10.25, '08'],
 [10.08695652173913, '05'],
 [9.41095890410959, '12'],
 [9.022727272727273, '06'],
 [8.127272727272727, '00'],
 [7.985294117647059, '23'],
 [7.852941176470588, '07'],
 [7.796296296296297, '03'],
 [7.170212765957447, '04'],
 [6.746478873239437, '22'],
 [5.5777777777777775, '09']]

# Print the 5 hours with the highest average comments, taken from the front
# of the descending-sorted [average, hour] list.

print("Top 5 Hours for 'Ask HN' Comments")
line_template = "{}: {:.2f} average comments per post"
for average, hour in sorted_swap[:5]:
    # Re-parse the two-digit hour so it can be rendered as HH:MM.
    hour_label = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print(line_template.format(hour_label, average))

Top 5 Hours for 'Ask HN' Comments
15:00: 38.59 average comments per post
02:00: 23.81 average comments per post
20:00: 21.52 average comments per post
16:00: 16.80 average comments per post
21:00: 16.01 average comments per post