#!/usr/bin/python
#
# This is -*- python -*- code
#

# Program name/version; sent as the User-Agent request header and shown in
# the title and footer of the generated report.
USER_AGENT = 'FindNew/0.6' # Put this here so I don't forget to update it

# FindNew.py 0.6:
#
#   Traverses a bookmarks file or web page to find out what items
#   have been updated lately.  Similar to a feature in Netscape 2.0's
#   bookmarks, but more HTML-friendly.
#
#   Chris Lawrence <quango@ix.netcom.com>
#   Freely distributable
#   16 May 1998

# TO DO:
#   Use multiple sockets at once
#   Better exceptions

# Changes since 0.5:
#   Now uses Python 1.5+'s rfc822.mktime_tz() function; this may even work
#   right in 1.5.2 (patches for 1.5.1 should be on the Python patch page soon)

# Changes since 0.4:
#   Changes toward HTTP/1.1 compliance
#   . Host header
#   . Connection header
#   Updated for Python 1.5; probably incompatible with < 1.5

# Changes since 0.3:
#   Now handles timezones better.  Requires patch to rfc822.py posted to
#      comp.lang.python by Guido (NOT my patch)
#   Shows the dates of modified pages
#   You can now specify the number of days old a page can be to be considered
#      updated.
#   Added easier ignored services list (rather than mega-if)
#   Ignore nntp and pop3 URLs
#   Fixed a bug that made unupdated pages look like they had no modification
#      time.

# Changes since 0.2:
#   It now makes some HTML and spawns your web browser to look
#      at the results.

# Changes since 0.1:
#   What it says is actually correct ;)
#   Better handling of mail and news URLs
#   No longer hosed if you don't have a WWW_HOME env var.

import sys, os, time, urlparse, string, httplib, htmllib, urllib, formatter
import socket, rfc822

from types import *

# Where this script is published; linked from the footer of the report.
HOME_URL = 'http://www.clark.net/pub/lawrencc/linux/findnew.py'

# Program that is launched to display the generated report.
WEB_BROWSER = 'lynx'

# HTTP proxy to send requests through, as 'hostname:port'; None means
# servers are contacted directly.
PROXY = None

# File name pattern for the report; %d is filled in with the process id.
# Lynx seems to need the .html extension.
TMP_PAT = '/tmp/jumplist%d.html'

# Non-zero prints progress messages; use 0 to switch off debugging.
DEBUG = 1

# URL schemes that are skipped without being checked.
IgnoreServices = [
        'mailto',
        'news',
        'telnet',
        'newspost',
        'nntp',
        'pop3',         # pop3 is used by Netscape
]

def check_url(url, max_redirects=10):
        """Look up the modification date of *url* with an HTTP HEAD request.

        url           -- the URL to check; a missing scheme defaults to 'file'.
        max_redirects -- how many further redirects may be followed before
                         giving up (guards against redirect loops).

        Returns 0 for schemes listed in IgnoreServices.  For http URLs,
        returns whatever headers.getdate_tz('Last-modified') yields: a time
        tuple usable with rfc822.mktime_tz(), or None when the server sent
        no usable Last-modified header.

        Raises IOError for bad or unexpected server replies and for
        redirect problems, and ValueError for schemes that are not
        implemented (ftp, file, https and anything unknown).
        """
        servname, hostandport, server_url, parameters, query, fragment = \
                  urlparse.urlparse(url, 'file')

        if servname in IgnoreServices:
                return 0

        if servname == 'https':
                raise ValueError('SSL secure HTTP unimplemented')

        if servname != 'http':
                # Covers ftp and file (probably easy enough to implement)
                # as well as any scheme we have never heard of.
                raise ValueError(servname + ' service unimplemented')

        # All we do is send a HEAD request, and hope that we're dealing
        # with a HTTP 1.0+ compliant server (i.e. not an ancient CERN or
        # NCSA).
        if PROXY is None:
                target = hostandport
                # An origin server wants the path together with any
                # parameters and query string, but never the fragment.
                request_uri = urlparse.urlunparse(
                        ('', '', server_url or '/', parameters, query, ''))
        else:
                target = PROXY
                # A proxy wants the absolute URL; strip the fragment
                # ourselves rather than hoping the proxy does.
                request_uri = urlparse.urldefrag(url)[0]

        hreq = httplib.HTTP(target)
        try:
                hreq.putrequest('HEAD', request_uri)
                hreq.putheader('User-Agent', USER_AGENT) # To be polite
                hreq.putheader('Host', hostandport)   # HTTP/1.1 Host header
                hreq.putheader('Connection', 'close') # Non-persistent cnxn
                hreq.endheaders()

                errcode, errmsg, headers = hreq.getreply()
        finally:
                # The headers are already parsed by now, so the socket
                # can be released whether or not the request succeeded.
                hreq.close()

        if errcode in (301, 302, 303, 307):
                # Handle redirected URLs correctly
                urlretry = headers.getheader('Location')
                if urlretry is None:
                        raise IOError('Unable to obtain new location.')
                if max_redirects <= 0:
                        raise IOError('Too many redirects.')
                # Location may be relative; resolve it against this URL.
                return check_url(urlparse.urljoin(url, urlretry),
                                 max_redirects - 1)

        if errcode == -1:
                raise IOError('Invalid response from server: ' +
                              repr(errmsg))

        if errcode != 200:
                raise IOError('Unhandled reply from server: ' +
                              repr(errcode) + ' ' + repr(errmsg))

        return headers.getdate_tz('Last-modified')

def datestr(timeval):
        """Format *timeval* (seconds since the epoch) as local time.

        Returns a string such as 'May 16 1998 at 03:15 PM EST'.  A value
        that cannot be converted yields 'Invalid date: ' followed by its
        repr instead of raising.
        """
        try:
                return time.strftime('%b %d %Y at %I:%M %p %Z',
                                     time.localtime(timeval))
        except (TypeError, ValueError, OverflowError, OSError):
                # Only conversion failures are swallowed; KeyboardInterrupt
                # and SystemExit must still get through.
                return 'Invalid date: ' + repr(timeval)

def is_url_newer(url, lastmod):
        """Report whether *url* was modified after *lastmod*.

        lastmod is a time in seconds since the epoch.  The result is one
        of:
          - the modification time (seconds since the epoch) when the page
            is newer than lastmod;
          - 0 when it is not newer, or when check_url() returned 0;
          - -1 when check_url() returned no date at all;
          - a string holding the error message when the check raised
            IOError, ValueError or socket.error.
        """
        try:
                stamp = check_url(url)
                if not stamp:
                        # Exactly 0 is passed through as "not newer";
                        # any other empty answer means "no date".
                        if stamp == 0:
                                return 0
                        return -1

                modified = rfc822.mktime_tz(stamp)
                if modified > lastmod:
                        return modified
                return 0

        except (IOError, ValueError, socket.error):
                # Fetch the exception in a way that reads the same on every
                # Python version.
                return str(sys.exc_info()[1])

def _escape_html(text):
        """Return *text* with &, <, > and " replaced by HTML entities."""
        # '&' has to go first so the entities added below are not re-escaped.
        text = str(text).replace('&', '&amp;')
        text = text.replace('<', '&lt;').replace('>', '&gt;')
        return text.replace('"', '&quot;')

def _canonical_url(url):
        """Return *url* with a scheme filled in and local paths made absolute."""
        parts = urlparse.urlparse(url, 'file')
        path = parts[2]
        if parts[0] == 'file':
                # Only local files are resolved against the working
                # directory; doing it for http would turn an empty path
                # into the local directory name.
                path = os.path.join(os.getcwd(), path)
        return urlparse.urlunparse((parts[0], parts[1], path,
                                    parts[3], parts[4], parts[5]))

def _write_report(filename, url, lastchange, entries):
        """Write the HTML report for *entries*, a list of (page, result) pairs."""
        safe_url = _escape_html(url)
        outfile = open(filename, 'w')
        try:
                outfile.write('<HTML>\n<HEAD>\n<TITLE>%s results</TITLE>\n'
                              % USER_AGENT)
                outfile.write('</HEAD>\n<BODY>\n')
                outfile.write('<P>The URL <A HREF="%s">%s</A> has links to the '
                              'following pages that have possibly been modified '
                              'since %s.</P>\n'
                              % (safe_url, safe_url, datestr(lastchange)))
                outfile.write('<UL>\n')

                for page, result in entries:
                        safe_page = _escape_html(page)
                        link = '<LI> <A HREF="%s">%s</A>' % (safe_page, safe_page)
                        if isinstance(result, str):
                                # The check failed; result is the error text.
                                outfile.write(link + ': %s\n'
                                              % _escape_html(result))
                        # Might really want to ignore these... *shrug*
                        elif result == -1:
                                outfile.write(link + ': no '
                                              'modification time available.\n')
                        else:
                                outfile.write(link + ' was updated (%s).\n'
                                              % datestr(result))

                outfile.write('</UL>\n')
                outfile.write('<HR><P>Generated by <A HREF="%s">%s</A>, by '
                              'Chris Lawrence &lt;quango@ix.netcom.com&gt;.</P>\n'
                              % (HOME_URL, USER_AGENT))
                outfile.write('</BODY>\n</HTML>\n')
        finally:
                outfile.close()

def _show_in_browser(filename):
        """Run WEB_BROWSER on *filename* and wait for it to exit."""
        pid = os.fork()
        if pid == 0:
                try:
                        os.execvp(WEB_BROWSER, (WEB_BROWSER, filename))
                except OSError:
                        sys.stderr.write('Unable to run %s: %s\n'
                                         % (WEB_BROWSER, sys.exc_info()[1]))
                # exec only returns on failure; leave at once so the child
                # never runs the parent's cleanup code.
                os._exit(127)
        os.waitpid(pid, 0)

def main():
        """Check every link on a page and show the recently updated ones.

        Usage: findnew.py [url-or-filename [days]]

        The page defaults to $WWW_HOME and *days* to 2 (values below 1
        are treated as 1).  The page is fetched and parsed for anchors,
        each distinct link is checked with is_url_newer(), and the links
        that were updated, could not be checked, or carry no modification
        time are written to a temporary HTML file that is displayed with
        WEB_BROWSER and then removed.
        """
        sincedate = 2
        if len(sys.argv) > 1:
                url = sys.argv[1]
                if len(sys.argv) > 2:
                        try:
                                sincedate = int(sys.argv[2])
                        except ValueError:
                                print('The number of days must be an integer.')
                                return
        else:
                url = os.environ.get('WWW_HOME')
                if not url:
                        print('You must specify a URL or a filename.')
                        return

        # Handle modification times
        if sincedate < 1:
                sincedate = 1

        lastchange = time.time() - (sincedate * 24 * 60 * 60)

        if DEBUG:
                print('Checking for pages updated since ' +
                      datestr(lastchange))

        url = _canonical_url(url)

        if DEBUG:
                print('Canonical URL: ' + url)

        page = urllib.urlopen(url)
        try:
                data = page.read()
        finally:
                page.close()

        parser = htmllib.HTMLParser(formatter.NullFormatter(None))
        parser.feed(data)
        # Closing flushes any buffered input, so it has to happen before
        # the collected anchors are read.
        parser.close()

        if parser.base is not None:
                url = parser.base

        # Maps each distinct link (fragment removed) to its check result.
        results = {}
        for anchor in parser.anchorlist:
                target = urlparse.urldefrag(urlparse.urljoin(url, anchor))[0]
                if target not in results:
                        if DEBUG:
                                print('Checking ' + target)
                        results[target] = is_url_newer(target, lastchange)

        # Keep errors (strings), missing dates (-1) and real update times;
        # a plain 0 means "not updated" and is left out of the report.
        updated = [(target, result) for target, result in results.items()
                   if isinstance(result, str) or result]

        if not updated:
                sys.stderr.write('No pages updated.\n')
                return

        updated.sort()          # stable, readable order in the report

        # NOTE(review): the name is predictable (pid in /tmp); consider the
        # tempfile module if this ever runs on a shared machine.
        filename = TMP_PAT % os.getpid()
        try:
                _write_report(filename, url, lastchange, updated)
                _show_in_browser(filename)
        finally:
                if os.path.exists(filename):
                        os.remove(filename)

# Run the checker only when executed as a script, not when imported.
if __name__ == '__main__':
        main()
