Harvesting Emails

Whenever I start a penetration test, I gather as much data as I can about the target. When I want to use social engineering as an attack vector, I typically rely on spear-phishing techniques. To help with that reconnaissance, I wrote a simple Python script that spiders one or more websites and extracts any email addresses it comes across.

import re
import threading
import time
from html.parser import HTMLParser
from urllib.request import urlopen
#************************************

sitesToScan = ['http://www.SomeURL.com', 'http://www.someURL2.com']

depth = 100     # recursion depth max per site
max_span = 100  # max new links collected per page

# Rough pattern for email addresses embedded in page source.
mailsrch = re.compile(r'[\w\-][\w\-.]+@[\w\-][\w\-.]+[a-zA-Z]{1,4}')

lock = threading.Lock()  # serializes writes to emails.txt across threads
#*************************************

class Spider(HTMLParser):
    def __init__(self, starting_url, depth, max_span):
        HTMLParser.__init__(self)
        self.url = starting_url
        self.db = {self.url: 1}  # how often each link has been seen
        self.node = [self.url]   # frontier of links for the next depth

        self.depth = depth        # recursion depth max
        self.max_span = max_span  # max links obtained per url
        self.links_found = 0

    def handle_starttag(self, tag, attrs):
        # Collect anchor targets, up to max_span per page.
        if self.links_found < self.max_span and tag == 'a':
            link = dict(attrs).get('href')
            if not link:
                return
            if not link.startswith('http'):
                # Resolve a relative link against the scheme and host.
                link = '/'.join(self.url.split('/')[:3]) + ('/' + link).replace('//', '/')

            if link not in self.db:
                self.links_found += 1
                self.node.append(link)
            self.db[link] = self.db.get(link, 0) + 1

    def crawl(self):
        for _ in range(self.depth):
            context_node = self.node[:]
            self.node = []
            for self.url in context_node:
                self.links_found = 0
                try:
                    res = urlopen(self.url).read().decode('utf-8', errors='ignore')
                    print("scanning, %s" % self.url)
                    for email in mailsrch.findall(res):
                        newEmail = "%s, %s" % (self.url, email)
                        with lock:  # one thread appends to the log at a time
                            with open('emails.txt', 'a') as log:
                                log.write(newEmail + '\n')
                            print(newEmail)
                        time.sleep(1)  # throttle so we don't hammer the site
                    self.feed(res)  # parse the page and queue new links
                except Exception:
                    self.reset()    # bad fetch or parse: clear state, move on
        # Return links sorted by visit count, most-seen first.
        zorted = [(v, k) for (k, v) in self.db.items()]
        zorted.sort(reverse=True)
        return zorted

def startACrawl(starting_url):
    print("---------------------  NEW THREAD STARTED ----------------")
    spidey = Spider(starting_url, depth, max_span)
    spidey.crawl()

if __name__ == "__main__":
    threads = [threading.Thread(target=startACrawl, args=(url,))
               for url in sitesToScan]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for all crawls instead of busy-waiting
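
If you want to sanity-check the extraction pattern before pointing the spider at a live site, a minimal sketch like the one below runs the same regex against a canned HTML fragment. The page content and addresses are made up purely for illustration.

import re

# Same pattern the spider uses to pull addresses out of page source.
mailsrch = re.compile(r'[\w\-][\w\-.]+@[\w\-][\w\-.]+[a-zA-Z]{1,4}')

# Hypothetical page source, only here to exercise the regex.
sample = """
<html><body>
  <p>Contact us at info@example.com or sales@example.co.uk.</p>
  <a href="mailto:webmaster@example.org">webmaster</a>
</body></html>
"""

print(mailsrch.findall(sample))
# ['info@example.com', 'sales@example.co.uk', 'webmaster@example.org']

Each hit the spider itself finds is appended to emails.txt as a "URL, email" pair on its own line, so the file also records which page each address was harvested from. One thread runs per starting URL, and the lock keeps those threads from interleaving their writes to the log.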
