# jofish 13 sept 2007
#
# Takes as input a series of files named conferencename1998.htm, etc. in the
# current directory. These can be downloaded from the proceedings section of
# the ACM Digital Library. The case of the conference name matters!
#
# Generates a series of files called conferencename1998.txt with one
# (lower-cased) author name per line. HTML character escapes (e.g. the HTML
# encoding of an accented e) are written out unchanged. Duplicates are not
# removed and the names are not sorted.

import sys
from os import listdir

conferencename = "CSCW"  # change this to your conference name

# NOTE(review): the four HTML delimiter literals of this script were lost in
# the copy under review (the tags were stripped out of the string literals).
# The values below are PLACEHOLDERS, not the author's originals -- confirm
# each one against a saved ACM DL proceedings page before trusting the output.
# They are matched against a lower-cased copy of each line, so they must be
# written in lower case.
PHRASE_SEPARATOR = '<tr'                 # splits a line into candidate phrases
AUTHOR_MARKER = '<div class="authors">'  # a phrase must contain this
NAMES_START = '<div class="authors">'    # the names begin right after this
NAMES_END = '</div>'                     # ...and stop right before this

# Longer "names" are treated as crud and dropped (seen in CHI1983).
MAX_NAME_LENGTH = 50


def extract_names(line,
                  phrase_separator=PHRASE_SEPARATOR,
                  author_marker=AUTHOR_MARKER,
                  names_start=NAMES_START,
                  names_end=NAMES_END):
    """Return the author names found in one line of a proceedings page.

    Lines that do not contain "citation.cfm" (checked case-sensitively)
    yield nothing. Otherwise the line is lower-cased and split on
    *phrase_separator*; in every phrase containing *author_marker*, the text
    between *names_start* and *names_end* is split on commas. Each name is
    stripped of surrounding whitespace and lower-cased; empty names and names
    longer than MAX_NAME_LENGTH characters are dropped.

    The delimiters are compared against lower-cased text, so pass them in
    lower case. Returns a list of strings in page order, duplicates included.
    """
    if "citation.cfm" not in line:
        return []

    found = []
    for phrase in line.lower().split(phrase_separator):
        if author_marker not in phrase:
            continue
        pieces = phrase.split(names_start)
        if len(pieces) < 2:
            # Marker present but no opening tag: skip instead of crashing
            # with an IndexError.
            continue
        listing = pieces[1].split(names_end)[0]
        for fullname in listing.split(","):
            name = fullname.strip().lower()
            if not name or len(name) > MAX_NAME_LENGTH:
                continue
            found.append(name)
    return found


def process_file(filename):
    """Write the author names found in *filename* to a sibling .txt file.

    The output name is everything before the first '.' in *filename* plus
    '.txt'. The year (the text between the conference name and the next '.')
    is only reported on stdout.
    """
    year = filename.split(conferencename)[1].split('.')[0]
    # Same bytes as the old Python 2 statement `print "\nyear :",year`,
    # but valid on Python 3 as well.
    sys.stdout.write("\nyear : %s\n" % year)

    outname = filename.split('.')[0] + '.txt'
    # `with` closes both files even if reading or writing fails part-way.
    with open(outname, 'w') as outfile:
        with open(filename, 'r') as infile:
            for line in infile:
                for name in extract_names(line):
                    outfile.write(name + "\n")


def main():
    """Process every matching .htm file in the current directory, sorted."""
    for filename in sorted(listdir(".")):
        # Looks for CSCW2001.htm, CSCW2002.htm, etc.; ignores other files.
        is_htm = filename.lower().split('.')[-1] == 'htm'
        if not is_htm or conferencename not in filename:
            continue
        process_file(filename)


# Deliberately unguarded: the script has always done its work at module level.
main()