from bs4 import BeautifulSoup
import json
def getHtml(url):
    """Fetch *url* and return it parsed as a BeautifulSoup tree (lxml parser)."""
    import urllib2  # local import, kept as in the original script (Python 2)
    response = urllib2.urlopen(url)
    return BeautifulSoup(response, 'lxml')
def putFile(file, data):
    """Write *data* to the path *file*, replacing any existing contents.

    NOTE(review): the parameter name shadows the ``file`` builtin; it is kept
    unchanged so existing callers are unaffected.
    """
    # `with` guarantees the handle is closed even if write() raises,
    # unlike the original open/write/close sequence.
    with open(file, "w") as fh:
        fh.write(data)
def firstPage(html):
    """Extract comment-item links from the parsed landing page *html*.

    Returns a single comma-terminated string of absolute URLs
    (each entry ends with a trailing comma, as downstream split() expects).
    """
    comment_divs = html.findAll("div", {"class": "small-meta__item--comment"})
    pieces = []
    for div in comment_divs:
        href = div.find('a', href=True).get('href')
        pieces.append("http://buzzfeed.com/" + str(href) + ",")
    return "".join(pieces)
def otherPages(page):
    """Fetch BuzzFeed's mid-column plugin feed for *page* and collect its links.

    Returns a comma-terminated string of URLs, matching firstPage's format.
    NOTE(review): no trailing '/' is added before the href here, unlike
    firstPage — presumably these hrefs already start with '/'; verify.
    """
    feed_url = 'http://buzzfeed.com/plugin/midcolumn/v:1.1/p:' + str(page)
    soup = getHtml(feed_url)
    result = []
    for div in soup.findAll("div", {"class": "lede__body"}):
        href = div.find('a', href=True).get('href')
        result.append("http://buzzfeed.com" + str(href) + ",")
    return "".join(result)
def getLinks(url, limit = 36, msg = False):
    """Collect BuzzFeed article links starting from *url*.

    The landing page yields the first 36 links; each additional plugin page
    (via otherPages) yields ~30 more, fetched until *limit* is reached.

    Parameters:
        url   -- landing page to scrape first.
        limit -- approximate number of links wanted (<= 36 means one page only).
        msg   -- when True, print progress messages.

    Returns a comma-terminated string of URLs (same format as firstPage).
    """
    html = getHtml(url)
    if limit < 37:
        if msg:
            # print(...) with a single argument is valid in Python 2 and 3
            print("Ah! Single page only")
        return firstPage(html)
    if msg:
        print("On page 0 fetching 0 - 36")
    urls = firstPage(html)
    if urls:
        loaded = 36
        page = 1
        while loaded < limit:
            if msg:
                # BUG FIX: original printed loaded+loaded (e.g. "36 - 72");
                # each plugin page contributes 30 links, so the range is
                # loaded .. loaded+30.
                print("On page " + str(page) + " fetching " + str(loaded) + " - " + str(loaded + 30))
            urls += otherPages(page)
            loaded += 30
            page += 1
    return urls
# Guard the script entry point so importing this module does not trigger a
# network fetch (the original ran unconditionally at import time).
if __name__ == "__main__":
    links = getLinks('http://www.buzzfeed.com')
    # split(",") on the comma-terminated string leaves a trailing empty entry
    linksarray = links.split(",")
    print(json.dumps(linksarray))
# NOTE(review): two lines of stray web-page footer text ("Be the first to
# comment" plus embed-code instructions) were accidentally pasted into this
# file; they are not code and broke parsing, so they are commented out here.