import argparse
import re

import requests


def get_emails(source):
    """Scrape all email addresses, including obfuscated "at"/"dot" forms, from a string."""
    if not source:
        return []
    # Matches plain addresses (user@example.com) as well as obfuscated
    # ones written like "user at example dot com".
    regex = re.compile(r"([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`"
                       r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
                       r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)")
    # findall() returns a tuple of groups per match; the outer group
    # (index 0) is the full address. Skip protocol-relative URLs ("//...")
    # that the pattern can otherwise pick up.
    return (match[0] for match in regex.findall(source) if not match[0].startswith('//'))


def get_source(url):
    """Fetch the web page source; returns None on a non-200 response."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text


def print_emails(emails):
    """Print each unique email address once."""
    for email in set(emails):
        print(email)


def main(url):
    try:
        print_emails(get_emails(get_source(url)))
    except requests.exceptions.RequestException as e:
        print(e)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="website / web page url", type=str)
    args = parser.parse_args()
    main(args.url)
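
A quick sanity check of the regex (the sample text and addresses here are made up for illustration): get_emails catches both plain addresses and obfuscated "at"/"dot" forms.

sample = "Write to alice@example.com or bob at example dot com."
print(sorted(get_emails(sample)))
# ['alice@example.com', 'bob at example dot com']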
This script scrapes all email addresses from the web page at the URL passed as the first script argument.
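To run it, save the script to a file (e.g. scrape_emails.py; the filename is arbitrary) and pass a URL:

python scrape_emails.py https://example.com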
Updated: Convert the LIST of emails to a SET to remove duplicate emails.
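For reference, that is all the update does: set() collapses duplicates but yields elements in arbitrary order, so wrap it in sorted() if deterministic output matters (the addresses below are made-up placeholders).

emails = ["a@example.com", "b@example.com", "a@example.com"]
unique = set(emails)   # duplicates collapse: {'a@example.com', 'b@example.com'}
print(sorted(unique))  # sorted() gives stable, repeatable output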