Web Scraping with Selenium and Python

from selenium import webdriver import time import os import datetime as dt from BeautifulSoup import BeautifulSoup from pyvirtualdisplay import Display import json import sys,getopt import datetime chromedriver = "/usr/bin/chromedriver" os.environ["webdriver.chrome.driver"] = chromedriver display = Display(visible=0, size=(800, 600)) display.start() driver = webdriver.Chrome(chromedriver) # driver = webdriver.Firefox() url = "http://www.flashscore.com" driver.implicitly_wait(10) list_of_sports = ['soccer', 'tennis', 'baseball', 'american-football', 'hockey', 'basketball'] def get_json_of_sport(sport, sport_web_driver, date, data): table_main = sport_web_driver.find_element_by_class_name('table-main') soup = BeautifulSoup(table_main.get_attribute('innerHTML')) for table in soup.findAll("table", {"class": sport}): league = table.find("span", {"class": 'country_part'}).text nation = table.find("span", {"class": 'tournament_part'}).text print "-------------------------------------------------------------------------------------------" print table.find("span", {"class": 'country_part'}).text, table.find("span", {"class": 'tournament_part'}).text print "--------------------------------------------------------------------------------------------" trs = table.findAll("tr", {"class": lambda x: x and "stage-scheduled" in x.split()}) or \ table.findAll("tr", {"class": lambda x: x and "stage-finished" in x.split()}) or \ table.findAll("tr", {"class": lambda x: x and "stage-live" in x.split()}) for index, tr in enumerate(trs): match_json = {} try: time_of_match = tr.find("td", {"class": lambda x: x and "time" in x.split()}).text except: continue team_home = tr.find("td", {"class": lambda x: x and "team-home" in x.split()}).text try: team_away = tr.find("td", {"class": lambda x: x and "team-away" in x.split()}).text except: team_away = trs[index+1].find("td", {"class": lambda x: x and "team-away" in x.split()}).text match_json['homeTeam'] = team_home match_json['awayTeam'] = 
team_away match_json['Date'] = date match_json['Hour'] = time_of_match match_json['League'] = league match_json['Nation'] = nation data.append(set(match_json)) print time_of_match, team_home, team_away for sport in list_of_sports: url2 = url + '/' + sport + '/' driver.get(url2) time.sleep(10) days = 7 date_of_match = '' if sys.argv[1:]: print "==================================" argv = sys.argv[1:] for i in argv: date_of_match = datetime.datetime.strptime(i,'%Y/%m/%d').strftime("%Y/%m/%d") else: print "dfdfdf" date_of_match = dt.datetime.now().strftime("%Y/%m/%d") date_int = dt.datetime.now() for i in range(0, days): data = [] get_json_of_sport(sport, driver, date_of_match, data) json_data = json.dumps(data) date_string = date_int.strftime("%Y_%m_%d") print json_data filename = sport + "_" + date_string + '.txt' if os.path.isfile(filename): os.remove(filename) with open(filename, 'w') as outfile: json.dump(json_data, outfile) time.sleep(10) print "\n" try: driver.find_element_by_css_selector('span.day.tomorrow').click() except: break date_int = dt.datetime.now() + dt.timedelta(days=1) # date_of_match = (dt.datetime.now() + dt.timedelta(days=1)).strftime("%Y/%m/%d") date_of_match = '' if sys.argv[1:]: print"----------------" argv = sys.argv[1:] for i in argv: date_of_match = datetime.datetime.strptime(i,'%Y/%m/%d').strftime("%Y/%m/%d")+ dt.timedelta(days=1).strftime("%Y/%m/%d") else: print "fdfdf" date_of_match = (dt.datetime.now() + dt.timedelta(days=1)).strftime("%Y/%m/%d") driver.close()
Imagine what you could do if you could automate all the repetitive and boring activities you perform on the internet, like checking the first Google results for a given keyword every day, or downloading a bunch of files from different websites.

In this tutorial you’ll learn to use Selenium with Python, a web scraping tool that simulates a user surfing the Internet. For example, you can automate actions on your social accounts, simulate a user to test your web application, and automate anything in your daily life that is repetitive. The possibilities are infinite! :-)

Here is my example code for scraping data from a sports website. It grabs all the data and filters it by category — football, cricket, basketball, etc. This code will help you understand in detail how Selenium works with Python and how to scrape data using this technology.

Requirements:

Step 1 : Create Virtual ENV

You need virtualenv installed on your local machine. If virtualenv is already installed on your system, create a virtual environment with this command: virtualenv scrapy. If you don't have virtualenv installed, install it as root on your machine: sudo pip install virtualenv. Activate the environment using: source scrapy/bin/activate.

Step 2 : Install dependencies in your env.

BeautifulSoup==3.2.1
EasyProcess==0.1.9
PyVirtualDisplay==0.1.5
argparse==1.2.1
beautifulsoup4==4.4.1
selenium==2.47.3
wsgiref==0.1.2
Step 3 : Download the code from GitHub and run it. You will see the script downloading the match details category by category and writing them into a txt file.
You can run this code in two ways: with arguments or without arguments.
If you run the code as python filename.py (without arguments), you will see the details for today, tomorrow, and the following days.

Be the first to comment

You can use [html][/html], [css][/css], [php][/php] and more to embed the code. Urls are automatically hyperlinked. Line breaks and paragraphs are automatically generated.