How To Scrape BuzzFeed With Python & BeautifulSoup 4

from bs4 import BeautifulSoup
import json
import urllib2


# Fetch a URL and return it as a parsed BeautifulSoup document.
def getHtml(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    return soup


# Write a string out to a file on disk.
def putFile(path, data):
    f = open(path, "w")
    f.write(data)
    f.close()


# Pull the article links out of the BuzzFeed front page.
def firstPage(html):
    linkDivs = html.findAll("div", {"class": "small-meta__item--comment"})
    urls = ''
    for link in linkDivs:
        foundLink = link.find('a', href=True).get('href')
        urls += "http://buzzfeed.com/" + str(foundLink) + ","
    return urls


# Pull the article links out of one of the "load more" pages, which
# BuzzFeed serves from a separate plugin endpoint.
def otherPages(page):
    url = 'http://buzzfeed.com/plugin/midcolumn/v:1.1/p:' + str(page)
    html = getHtml(url)
    linkDivs = html.findAll("div", {"class": "lede__body"})
    urls = ''
    for link in linkDivs:
        foundLink = link.find('a', href=True).get('href')
        urls += "http://buzzfeed.com" + str(foundLink) + ","
    return urls


# Collect links until the limit is reached. The front page carries 36
# stories; every later plugin page adds roughly 30 more.
def getLinks(url, limit=36, msg=False):
    html = getHtml(url)
    if limit < 37:
        if msg:
            print "Ah! Single page only"
        return firstPage(html)
    else:
        if msg:
            print "On page 0 fetching 0 - 36"
        urls = firstPage(html)
        if urls:
            loaded = 36
            page = 1
            while loaded < limit:
                if msg:
                    print "On page " + str(page) + " fetching " + str(loaded) + " - " + str(loaded + 30)
                urls += otherPages(page)
                loaded = loaded + 30
                page = page + 1
        return urls


links = getLinks('http://www.buzzfeed.com')
# Strip the trailing comma so split() doesn't leave an empty entry.
linksarray = links.rstrip(",").split(",")
print json.dumps(linksarray)
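If you want more than the front page, or want to keep the result, the helpers above already cover it. Here is a quick usage sketch under the same assumptions as the script (Python 2, lxml installed; "links.json" is just an example filename, not something the site requires): it fetches roughly the first 100 links with progress messages and saves them as JSON via putFile.

# Grab roughly 100 links, printing progress as each page loads,
# then save the result to disk. "links.json" is an arbitrary
# example filename.
links = getLinks('http://www.buzzfeed.com', limit=100, msg=True)
linksarray = links.rstrip(",").split(",")
putFile("links.json", json.dumps(linksarray))

One caveat: urllib2 only exists on Python 2. On Python 3 the equivalent call is urllib.request.urlopen, and the print statements would need parentheses; the BeautifulSoup calls work the same either way.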
