#jofish 13 sept 2007 # #takes as input a series of files named conferencename1998.htm, etc. in the same directory. #these can be downloaded from the proceedings section of the ACM Digital Library #case of the conferencename matters! # #generates a series of files called conferencename1998.txt with one name per line. #doesn't change html characters (ie é) to any other representation. #that could be fixed. or not. conferencename = "CSCW" #change this to your conference name from os import listdir filelist = listdir(".") filelist.sort() for filename in filelist: #looks for cscw2001.htm cscw2002.htm etc if filename.lower().split('.')[-1]!='htm' or conferencename not in filename: continue #ignore other files outfile = open(filename.split('.')[0]+'.txt','w') year = filename.split()[0].split(conferencename)[1].split('.')[0] #sloppy, i know, but it'll work print "\nyear :",year for line in open(filename,'r'): #Incantation to the Gods of the ACM DL who believeth not in XML begins. if "citation.cfm" not in line: continue #I wish this was XML for phrase in line.lower().split('