Here is my Python script that grabs the latest 1000 comments for a video (unfortunately, the API only allows access to the latest 1000) and then checks them against a regexp that matches known racist words. Right now it is just looking for the N word. This script will be one of the inner MAP tasks in a series of Map-Reduce steps.

#!/usr/bin/env python
"""Map task: report YouTube comments that match a slur pattern.

Reads one YouTube video id per line from stdin, walks that video's comment
feed through the GData API and writes every matching comment to stdout as
``author<TAB>comment`` followed by an empty line.
"""
import sys
import gdata.youtube
import gdata.youtube.service
import re
import pprint

# Plain substring used with search(), so the word is found anywhere in a
# comment, including after a line break.
# NOTE(review): this also hits innocuous words such as "bigger" or
# "trigger" -- tighten the pattern if false positives matter.
racist_pattern = re.compile(r'igger', re.IGNORECASE)

# Debugging aid only; the map task itself never calls it.
pp = pprint.PrettyPrinter(indent=4)

yt_service = gdata.youtube.service.YouTubeService()
yt_service.developer_key = ""  # turns out the developer key isn't necessary

urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=%d&max-results=50'

# 19 pages of 50 comments cover entries 1-950 ...
FULL_PAGES = 19
# ... and one shorter page covers 951-999.  Currently the Google YouTube
# GData API will not support over 1000 comments.
last_page_pattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=951&max-results=49'


def _emit_matches(comment_feed):
    """Write every comment of *comment_feed* that matches racist_pattern."""
    for comment in comment_feed.entry:
        text = comment.content.text
        # NOTE(review): text is presumably None for an empty comment body;
        # skip those instead of handing None to the regex.
        if text and racist_pattern.search(text):
            # write() emits the same bytes the old print statement did
            # (record plus an empty line) and parses on Python 2 and 3.
            sys.stdout.write(
                '%s\t%s\n\n' % (comment.author[0].name.text, text))


def _scan_video(video_id):
    """Scan up to 999 comments of *video_id* and emit the matching ones."""
    url = urlpattern % (video_id, 1)
    for _ in range(FULL_PAGES):
        comment_feed = yt_service.GetYouTubeVideoCommentFeed(uri=url)
        _emit_matches(comment_feed)
        # A feed without a rel="next" link is the last page; stop here
        # rather than dereferencing None.
        next_link = comment_feed.GetNextLink()
        if next_link is None or not next_link.href:
            return
        url = next_link.href
    _emit_matches(
        yt_service.GetYouTubeVideoCommentFeed(uri=last_page_pattern % video_id))


for line in sys.stdin:
    video_id = line.strip()
    if not video_id:
        continue  # blank input line: there is no video to look up
    _scan_video(video_id)

#!/usr/bin/env python
"""Map task: report YouTube comments that match a slur pattern.

Reads one YouTube video id per line from stdin, walks that video's comment
feed through the GData API and writes every matching comment to stdout as
``author<TAB>comment`` followed by an empty line.
"""
import sys
import gdata.youtube
import gdata.youtube.service
import re

# Plain substring used with search(), so the word is found anywhere in a
# comment.  (A '.*'-wrapped pattern with match() only inspects the first
# line of a comment, because '.' does not match a newline.)
# NOTE(review): this also hits innocuous words such as "bigger" or
# "trigger" -- tighten the pattern if false positives matter.
racist_pattern = re.compile(r'igger', re.IGNORECASE)

# The developer key is deliberately left unset.  Never paste a real key
# into this file, not even inside a comment.
yt_service = gdata.youtube.service.YouTubeService()

urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=%d&max-results=50'

# 19 pages of 50 comments cover entries 1-950 ...
FULL_PAGES = 19
# ... and one shorter page covers 951-999.  Currently the Google YouTube
# GData API will not support over 1000 comments.
last_page_pattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=951&max-results=49'


def _emit_matches(comment_feed):
    """Write every comment of *comment_feed* that matches racist_pattern."""
    for comment in comment_feed.entry:
        text = comment.content.text
        # NOTE(review): text is presumably None for an empty comment body;
        # skip those instead of handing None to the regex.
        if text and racist_pattern.search(text):
            # write() emits the same bytes the old print statement did
            # (record plus an empty line) and parses on Python 2 and 3.
            sys.stdout.write(
                '%s\t%s\n\n' % (comment.author[0].name.text, text))


def _scan_video(video_id):
    """Scan up to 999 comments of *video_id* and emit the matching ones."""
    url = urlpattern % (video_id, 1)
    for _ in range(FULL_PAGES):
        comment_feed = yt_service.GetYouTubeVideoCommentFeed(uri=url)
        _emit_matches(comment_feed)
        # A feed without a rel="next" link is the last page; stop here
        # rather than dereferencing None.
        next_link = comment_feed.GetNextLink()
        if next_link is None or not next_link.href:
            return
        url = next_link.href
    _emit_matches(
        yt_service.GetYouTubeVideoCommentFeed(uri=last_page_pattern % video_id))


for line in sys.stdin:
    video_id = line.strip()
    if not video_id:
        continue  # blank input line: there is no video to look up
    _scan_video(video_id)


Published

27 November 2009