Scrape emails from web page

import argparse
import re

import requests


def get_emails(source):
    """Scrape all email addresses from a string."""
    if not source:
        return []
    regex = re.compile(
        r"([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`"
        r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
        r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
    )
    # re.findall returns one tuple per match; the full address is group 0.
    # Matches starting with "//" are protocol-relative URLs, not emails.
    return (email[0] for email in re.findall(regex, source)
            if not email[0].startswith('//'))


def get_source(url):
    """Get the web page source."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text


def print_emails(emails):
    """Print unique emails, one per line."""
    emails_set = set(emails)  # a set removes duplicate addresses
    for email in emails_set:
        print(email)


def main(url):
    try:
        print_emails(get_emails(get_source(url)))
    except requests.exceptions.RequestException as e:
        print(e)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="website / web page url", type=str)
    args = parser.parse_args()
    main(args.url)
This script scrapes all emails from the URL (web page) passed as its first argument.
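
For example, assuming the code above is saved as scrape_emails.py (the filename and URL here are only placeholders), it can be run against a page like this:

    python scrape_emails.py https://example.com/contact

Each unique address found in the page source is printed on its own line.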

Updated: the list of emails is now converted to a set to remove duplicate emails.
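
As a quick illustration of that change (the addresses below are just placeholders), building a set from the matched emails drops repeated entries:

    emails = ["info@example.com", "jobs@example.com", "info@example.com"]
    print(set(emails))  # {'info@example.com', 'jobs@example.com'} - duplicates removed, order not preserved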

