import requests
import sys
import re
def get_emails(source):
    """Scrapes all emails from a string."""
    if not source:
        return []
    # Matches plain addresses as well as obfuscated "user at domain dot com" forms.
    regex = re.compile(r"([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`"
                       r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
                       r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)")
    # Skip matches that start with "//" (keeps protocol-relative URLs out of the results).
    return (email[0] for email in regex.findall(source)
            if not email[0].startswith('//'))
def get_source(url):
    """Gets web page source."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
def print_emails(emails):
    """Prints unique emails from an iterable."""
    emails_set = set(emails)  # drop duplicates
    for email in emails_set:
        print(email)
def check_args():
    """Checks if url is passed to script."""
    return len(sys.argv) == 2
def print_usage():
    """Prints usage information."""
    print("\nUsage: python " + sys.argv[0] + " <page url>")
if __name__ == "__main__":
    if check_args():
        try:
            print_emails(get_emails(get_source(sys.argv[1])))
        except requests.exceptions.RequestException as e:
            print(e)
    else:
        print_usage()
This script scrapes all emails from the URL (web page) passed as the first script argument.
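For a quick check without fetching a page, get_emails() can also be run directly on a string. This is a minimal sketch, assuming the snippet above is saved as a module named scrape_emails.py (the file name is an assumption, not part of the original post):

from scrape_emails import get_emails, print_emails

# A sample string with one plain address and one obfuscated "at/dot" address,
# both of which the regex above matches (lowercase only, since the pattern is
# compiled without re.IGNORECASE).
sample = "Write to john.doe@example.com or to jane at example dot org for details."

print_emails(get_emails(sample))
# Expected output (order may vary, since print_emails dedupes with an unordered set):
#   john.doe@example.com
#   jane at example dot org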
Updated: Convert LIST of emails to SET to remove duplicate emails.
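For context on that change: set() drops duplicates but does not keep the order in which matches were found. A small alternative, not used in the snippet, that preserves first-seen order on Python 3.7+ is dict.fromkeys():

emails = ["a@example.com", "b@example.com", "a@example.com"]

print(set(emails))                  # duplicates removed, order arbitrary
print(list(dict.fromkeys(emails)))  # ['a@example.com', 'b@example.com'], order preserved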