Scrape emails from web page

import re
import sys

import requests

# Matches plain addresses ("user@example.com") as well as the obfuscated
# " at " / " dot " spelling ("user at example dot com").  Compiled once at
# import time; IGNORECASE keeps addresses containing uppercase letters from
# being truncated or skipped.
_EMAIL_RE = re.compile(
    r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
    r"(?:@|\sat\s)"
    r"(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.|\sdot\s))+"
    r"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
    re.IGNORECASE,
)


def get_emails(source):
    """Yield every email address found in the string *source*.

    Matches that start with ``//`` are skipped, because the local part of
    the pattern accepts ``/`` and would otherwise pick up protocol-relative
    URLs such as ``//user@host.com``.

    An empty or ``None`` *source* yields nothing, so the result can always
    be iterated safely.
    """
    if not source:
        return
    for match in _EMAIL_RE.finditer(source):
        email = match.group(0)
        if not email.startswith('//'):
            yield email


def get_source(url, timeout=10):
    """Return the body of the page at *url*, or ``None`` on a non-200 reply.

    *timeout* (seconds) is forwarded to ``requests.get`` so that an
    unresponsive server cannot block the script forever.  Network failures
    propagate as ``requests.exceptions.RequestException`` subclasses.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None


def print_emails(emails):
    """Print each distinct address in *emails* once, in sorted order.

    ``None`` is treated as "no emails" instead of raising ``TypeError``.
    """
    # Going through a set removes duplicates; sorting makes the output
    # reproducible between runs.
    for email in sorted(set(emails or ())):
        print(email)


def check_args():
    """Return True when exactly one argument (the URL) was passed."""
    return len(sys.argv) == 2


def print_usage():
    """Print a short usage message for the script."""
    print("\nUsage: python " + sys.argv[0] + " <page url>")


if __name__ == "__main__":
    if check_args():
        try:
            print_emails(get_emails(get_source(sys.argv[1])))
        except requests.exceptions.RequestException as e:
            print(e)
    else:
        print_usage()
This script scrapes all emails from url (web page) passed as the first script argument.

Updated: Convert LIST of emails to SET to remove duplicate emails.

Be the first to comment

You can use [html][/html], [css][/css], [php][/php] and more to embed the code. Urls are automatically hyperlinked. Line breaks and paragraphs are automatically generated.