Web Changes

This script checks a list of web pages for changes and sends me an email when one of them has changed. In theory it also works for pages that require a login, if you know how to extract the cookie data.

import md5
import urllib2
import smtplib
import time
import getpass

# --- Mail configuration -------------------------------------------------
smtpserver = 'smtp.umn.edu'  # SMTP host used to send change notifications
smtpuser = 'ande7966'  # SMTP login name
# NOTE: prompts for the SMTP password interactively at import time, so the
# password is never stored on disk (but the script cannot run unattended
# from a non-interactive shell).
smtppass = getpass.getpass()
RECIPIENTS = ['cmawebsite@gmail.com']  # who receives the notification emails
SENDER = 'ande7966@umn.edu'  # From address for outgoing mail

def send_email(message):
    """Send *message* (a raw RFC 2822 string) from SENDER to RECIPIENTS.

    Returns smtplib's sendmail() result: a dict mapping each refused
    recipient to its error; an empty dict means everyone was accepted.
    Raises smtplib.SMTPException (or a socket error) on failure.
    """
    session = smtplib.SMTP(smtpserver)
    try:
        # Upgrade to TLS before sending credentials over the wire.
        session.starttls()
        session.login(smtpuser, smtppass)
        return session.sendmail(SENDER, RECIPIENTS, message)
    finally:
        # Always tear down the SMTP connection — the original leaked one
        # open connection per call, and never even on success closed it.
        session.quit()

def check_for_changes():
    """Fetch every URL listed in urls.txt and email when content changed.

    urls.txt format, one entry per line (whitespace-separated):
        <url> [<md5-hex-of-last-content> [<cookie-header-value>]]
    The file is rewritten at the end with the updated hashes.
    """
    # Parse urls.txt into a list of {'url', 'hash'?, 'cookie'?} dicts.
    f = open('urls.txt', 'r')
    try:
        urls = []
        for line in f:
            parts = line.split()
            if not parts:
                continue  # skip blank lines
            data = {'url': parts[0]}
            if len(parts) > 1:
                data['hash'] = parts[1]
            if len(parts) > 2:
                data['cookie'] = parts[2]
            urls.append(data)
    finally:
        f.close()

    for row in urls:
        try:
            req = urllib2.Request(row['url'])
            # Only send a Cookie header when one is configured; the
            # original sent the literal header "Cookie: None" otherwise.
            if 'cookie' in row:
                req.add_header('Cookie', row['cookie'])
            response = urllib2.urlopen(req)
            try:
                body = response.read()
            finally:
                response.close()
            new_hash = md5.new(body).hexdigest()
        except Exception as e:
            print(e)
            if str(e) == "<urlopen error (11001, 'getaddrinfo failed')>":
                # DNS failure: assume the whole connection is down and
                # give up on this round entirely.
                print("Internet connection Troubles.")
                return
            if str(e) == "<urlopen error (10060, 'Operation timed out')>":
                print("Timed Out")
                continue
            # Any other error text stands in for the page content, so a
            # persistent error is only reported once, not every cycle.
            new_hash = str(e).replace(' ', '_')
        if row.get('hash') != new_hash:
            print(row['url'] + ' has changed: ' + new_hash)
            try:
                # '\r\n\r\n' separates the Subject header from the body;
                # the original's mangled 'rnrn' made a malformed message.
                print(send_email('Subject: %s\r\n\r\n%s has changed: %s %s'
                                 % (row['url'], row['url'],
                                    row.get('hash'), new_hash)))
            except Exception as e:
                print(e.__class__)
                print(e)
                return  # email is broken; keep the old hashes on disk
            row['hash'] = new_hash

    # Rewrite urls.txt with the updated hashes (and original cookies).
    # The original wrote a literal 'n' instead of '\n', collapsing the
    # whole file onto one line and corrupting it for the next parse.
    f = open('urls.txt', 'w')
    try:
        for row in urls:
            f.write(row['url'] + ' ' + row.get('hash', '') + ' '
                    + row.get('cookie', '') + '\n')
    finally:
        f.close()

if __name__ == '__main__':
    try:
        # Poll forever: check, then count down 15 minutes in 5-minute steps.
        while 1:
            print('Checking now...')
            check_for_changes()
            print('Checking again in 15 minutes')
            time.sleep(5 * 60)
            print('Checking again in 10 minutes')
            time.sleep(5 * 60)
            print('Checking again in 5 minutes')
            time.sleep(5 * 60)
    except Exception:
        # Print the traceback instead of dying silently; the original
        # bound the exception to an unused variable.
        import traceback
        traceback.print_exc()
    print('done')
    # Wait for a keypress so a double-clicked console window stays open
    # long enough to read the error.
    raw_input()
Advertisements