# Takes as input a directory named after the conference (see conferencename
# below) which contains a series of files called conferencename1998.txt, etc.
# Each file holds one author name per line, optionally including HTML
# character encoding (such as &eacute; for é).
# Outputs an image file called conferencename.png.
# Change conferencename so it knows what to look for.
# You'll need the Python Imaging Library installed.
# jofish
# v1.22
# written 14 sept 2007: happy birthday janet vertesi!
# modified 4 Feb 2009 to work on macs as well

from os import listdir
import re

# You'll need to install the Python Imaging Library, PIL, if you don't have it.
from PIL import Image, ImageDraw, ImageFont

conferencename = 'CSCW'

# Per-year data, filled in while the input files are read.
allauthors = {}    # year -> list of author keys seen that year
authordict = {}    # author key -> earliest year that author appeared
fullnames = {}     # author key -> first full name seen for that key

# Running totals used for sizing the image and for the console summary.
yearcount = 0
mostauthors = 0
totalauthors = 0

# One colour per "first year"; yellow is left out because it is unreadable.
colours = ['red', 'orange', 'green', 'blue', 'indigo', 'violet']


def initialize(name):
    """Return the author key for *name*: last word, a space, first character.

    For example "Joseph Kaye" gives "Kaye J".
    """
    surname = name.split()[-1]
    first_letter = name[0]
    return surname + " " + first_letter


def yeartocolor(year):
    """Return the colour for *year*, cycling through ``colours``.

    Reads the module-level ``allyears`` list, so it can only be called once
    that list exists.  Could be more elaborate.
    """
    position = allyears.index(year)
    return colours[position % len(colours)]


def firstyear(author):
    """Return a sort key for *author*: their first year followed by their key.

    For example "1983norman d".  Sorting on this orders authors by first
    year and alphabetically within a year.
    """
    debut = authordict[author]
    return debut + author
def unescape(text):
    """Convert HTML character references in *text* to the characters themselves.

    Adapted from http://effbot.org/zone/re-sub.htm#unescape-html

    Handles decimal references (``&#233;``), hexadecimal references
    (``&#xE9;``) and named entities (``&eacute;``).  Anything that cannot be
    converted is left exactly as written.
    """
    # The entity table is imported inside the function so that named entities
    # work without relying on a module-level import.  The module was renamed
    # between Python 2 and Python 3, so try both names.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def fixup(m):
        token = m.group(0)
        try:
            if token[:2] == "&#":  # character reference
                if token[:3] == "&#x":
                    return to_char(int(token[3:-1], 16))
                return to_char(int(token[2:-1]))
            # named entity
            return to_char(name2codepoint[token[1:-1]])
        except (KeyError, ValueError, OverflowError):
            return token  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)


def unique(s):
    """Return a list of the distinct elements of *s*, in no particular order.

    Taken from the Python Cookbook:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    Three strategies are tried in turn: dictionary keys (needs hashable
    elements), sort-and-compact (needs orderable elements), and finally a
    brute-force scan that only needs equality.
    """
    n = len(s)
    if n == 0:
        return []

    # Strategy 1: hashable elements.
    u = {}
    try:
        for x in s:
            u[x] = 1
    except TypeError:
        pass  # move on to the next method
    else:
        return list(u)

    # Strategy 2: sortable elements; equal items end up adjacent.
    try:
        t = sorted(s)
    except TypeError:
        pass  # move on to the next method
    else:
        last = t[0]
        lasti = i = 1
        while i < n:
            if t[i] != last:
                t[lasti] = last = t[i]
                lasti += 1
            i += 1
        return t[:lasti]

    # Strategy 3: brute force is all that's left.
    u = []
    for x in s:
        if x not in u:
            u.append(x)
    return u


def sortedDictValues(adict):
    """Return the values of *adict* ordered by their keys.

    From the Python Cookbook.
    """
    return [value for key, value in sorted(adict.items())]


# OK, let's look in the directory and see what's in there.
filelist = sorted(listdir(conferencename))

for filename in filelist:
    # Only process files of the form conferencenameYYYY.txt (case-insensitive).
    lowered = filename.lower()
    if lowered[-3:] != 'txt' or lowered[:len(conferencename)] != conferencename.lower():
        continue
    yearcount += 1
    yearauthors = 0
    year = lowered.split(conferencename.lower())[1].split('.')[0]  # bit sloppy.
    allauthors[year] = []
    seen_this_year = set()  # same contents as allauthors[year], for fast lookup
    print("~ " + filename)
    with open(conferencename + '/' + filename) as handle:
        for line2 in handle:
            # A line may still contain bare '\r' line endings, so split on
            # those as well.
            for line in line2.split('\r'):
                # Not unescape()d yet, because it causes problems when we go
                # out of ascii this early.
                name = line.strip()
                if name == 'jr' or name == 'jr.':
                    continue  # there can be some problems with those.
                if name == "":
                    continue  # get rid of blank lines. sigh.
                # We're assuming that "F. Last" is always the same person, so
                # we do run a risk doing this.  It seems to work for the
                # datasets this was built for, but it'll break down if you
                # have a lot of Chinese authors in the field you're looking
                # at, for example.  Be aware.
                initials = initialize(name)
                # Don't put in duplicates.  You could leave them in, depending
                # on what you're trying to show.
                if initials not in seen_this_year:
                    seen_this_year.add(initials)
                    allauthors[year].append(initials)
                    yearauthors += 1
                if initials not in fullnames:
                    # So we can pull out the full names later.
                    fullnames[initials] = name
                # Record the earliest year this author was seen.
                if initials not in authordict or authordict[initials] > year:
                    authordict[initials] = year
    if yearauthors > mostauthors:
        mostauthors = yearauthors  # so we can size the image properly
    print("Year: %s Number of authors:  %s" % (year, yearauthors))
    totalauthors += yearauthors
print("Total authors: %s" % totalauthors)

if yearcount == 0:
    # Nothing to draw; stop here with a clear message instead of failing
    # later on an empty dataset.
    raise SystemExit("No files named %sYYYY.txt were found in the directory '%s'."
                     % (conferencename, conferencename))

# Now we've got the data, let's start making the image.
width = 120  # width of each display column
space = 4  # space in between them
nameheight = 12
imgxsize = width * yearcount
# The extra 420 leaves room for titles and all.
imgysize = max(mostauthors * nameheight + 420, 420)

# Try the Arial family first and fall back to Helvetica.  Only a font that
# cannot be opened (IOError) triggers the fallback, and the failure message is
# printed only when neither set could be loaded.
try:
    bigfont = ImageFont.truetype("arial.ttf", 20)
    hugefont = ImageFont.truetype("ARIBLK.TTF", yearcount * 8)
    percentfont = ImageFont.truetype("arial.ttf", 25)
    smallfont = ImageFont.truetype("arialn.ttf", 14)  # Arial Narrow
except IOError:
    try:
        bigfont = ImageFont.truetype("Helvetica.ttf", 20)
        hugefont = ImageFont.truetype("Helvetica.ttf", yearcount * 8)
        percentfont = ImageFont.truetype("Helvetica.ttf", 25)
        smallfont = ImageFont.truetype("Helvetica.ttf", 14)
    except IOError:
        print("EPIC FAIL! None of the fonts worked.")
        raise
# Draw the poster: one column per year, with that year's authors stacked
# upwards from the bottom and coloured by the year each author first appeared.
myimg = Image.new("RGB", (imgxsize, imgysize), "#FFFFFF")
draw = ImageDraw.Draw(myimg)
allyears = sorted(allauthors)  # yeartocolor() reads this module-level list
repeatssum = 0
# Starts at -1 because the first year will always have zero repeats, so it is
# left out of the average.
repeatcount = -1
yearssofar = 0
for year in allyears:  # and now let's process through one year at a time.
    column_x = (width * yearssofar) + space
    draw.text((column_x, imgysize - 30), year, font=bigfont, fill='black')
    namecount = 0
    # This next line seems to crap out on [some] unicode stuff.
    # Sort that year's authors by the year they first showed up.
    allauthors[year].sort(key=firstyear)
    new = 0.0  # we'll need to know this to calculate repeats percentage
    old = 0.0
    for name in allauthors[year]:  # so for each author
        namecount += 1
        theirfirstyear = authordict[name]
        color = yeartocolor(theirfirstyear)
        if theirfirstyear == year:
            new += 1
        else:
            old += 1
        # This is where the HTML-to-text conversion happens for things like é.
        draw.text((column_x, imgysize - (50 + (namecount * nameheight))),
                  unescape(fullnames[name]), font=smallfont, fill=color)
    # Now let's do the math to figure out what percentage of those have been
    # here before.
    if old + new == 0:
        repeat = 0
    else:
        repeat = (old / (old + new)) * 100
    repeats = '%2.0f' % repeat
    repeatssum += repeat
    repeatcount += 1
    repeatstring = str(int(old)) + '/' + str(int(old + new))
    draw.text((column_x, imgysize - (100 + (namecount * nameheight))),
              repeatstring, font=percentfont, fill='Grey')
    # The value is old/(old+new), i.e. the share of repeat authors, so the
    # label says "repeat" to agree with the legend drawn below.
    draw.text((column_x, imgysize - (75 + (namecount * nameheight))),
              repeats + "% repeat", font=percentfont, fill='Grey')
    yearssofar += 1

# Put in the title list at the top (might not look right on very large or
# small datasets).
if repeatcount > 0:
    averagerepeats = '%2.0f' % (float(repeatssum) / repeatcount)
else:
    # With a single year there are no later years to average over; dividing
    # here would raise ZeroDivisionError.
    averagerepeats = '%2.0f' % 0.0
draw.text((20, 20), conferencename + " " + min(allyears) + "-" + max(allyears),
          font=hugefont, fill='black')
draw.text((20, 60 + (yearcount * 8)), "Grey numbers are percentage",
          font=percentfont, fill='black')
# Floor division keeps the offset an integer on both Python 2 and Python 3.
draw.text((20, width // 2 + 60 + (yearcount * 8)),
          "of repeat authors (average " + averagerepeats + "%)",
          font=percentfont, fill='black')
draw.text((20, 200 + (yearcount * 8)),
          "Joseph 'Jofish' Kaye, Nokia Research Center, Palo Alto",
          font=bigfont, fill='black')
draw.text((20, 240 + (yearcount * 8)), "http://posters.jofish.com",
          font=bigfont, fill='gray')
myimg.save(conferencename + ".png", quality=95)
print("Created image file:  " + conferencename + ".png")