Sorts the D-INFK courses at ETH by rating, using the lecture evaluations published on the intranet.
There are surely nicer ways to do this, but otherwise I don't write Python. Requires pdfminer and access to the ETHZ intranet.
Code:
import urllib2
import sys
import base64
import re
import sgmllib
import pdfminer
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import XMLConverter

# Evaluation overview pages to scrape (FS11 and HS11).
urls = ("https://www1.ethz.ch/inf/intranet/evaluation/fs11",
        "https://www1.ethz.ch/inf/intranet/evaluation/hs11/index")
username = "nethzusername"   # nethz login for the intranet
password = "nethzpasswort"
tmpfile = "tmp.pdf"          # each downloaded PDF is written here before parsing
mwOfInterest = 10            # index of the extracted mean value ("MW") used for sorting
# SGML parser that collects the links to the evaluation PDFs and the
# corresponding course titles from the <a> tags of an overview page.
class MyParser(sgmllib.SGMLParser):
    def parse(self, s):
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.pdfs = []
        self.titles = []
        self.pdffinder = re.compile(r'''.*\.pdf''', re.IGNORECASE)
        self.titlefinder = re.compile(r'''.*\'(.*)\'.*''', re.IGNORECASE)

    def start_a(self, attributes):
        # Only keep the title if the href of the same tag pointed to a PDF.
        active = False
        for name, value in attributes:
            if name == "href":
                if self.pdffinder.match(value):
                    active = True
                    self.pdfs.append(value)
            if name == "title":
                if active:
                    self.titles.append(self.titlefinder.match(value).group(1))

    def get_pdfs(self):
        return self.pdfs

    def get_titles(self):
        return self.titles
# Minimal file-like object; pdfminer's XMLConverter writes its output here,
# one chunk per write() call.
class MyOutWriter():
    def __init__(self):
        self.lines = []

    def write(self, string):
        self.lines.append(string)

    def get_lines(self):
        return self.lines
# Fetch a URL; if the server answers with an auth challenge, retry with
# an HTTP Basic Authorization header.
def geturl(theurl, username, password):
    print "Getting url '%s'..." % theurl
    req = urllib2.Request(theurl)
    try:
        handle = urllib2.urlopen(req)
    except IOError, e:
        print "URL requires authorization. Authenticating..."
        authline = e.headers['www-authenticate']
        authobj = re.compile(
            r'''(?:\s*www-authenticate\s*:)?\s*(\w*)\s+realm=['"]([^'"]+)['"]''',
            re.IGNORECASE)
        matchobj = authobj.match(authline)
        # Build the Basic auth header by hand (drop the trailing newline
        # that encodestring appends).
        base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
        authheader = "Basic %s" % base64string
        req.add_header("Authorization", authheader)
        try:
            handle = urllib2.urlopen(req)
        except IOError, e:
            print "Could not authenticate."
            return ""
        print "Authenticated."
    thepage = handle.read()
    return thepage
# Run pdfminer over an open PDF file and return the XMLConverter output
# as a list of write() chunks.
def parsepdf(fp):
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    pdfpass = ''
    doc.initialize(pdfpass)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    outfp = MyOutWriter()
    device = XMLConverter(rsrcmgr, outfp)
    process_pdf(rsrcmgr, device, fp)
    return outfp.get_lines()
# Each character of the PDF text ends up in its own <text> element, so
# consecutive characters are three write() chunks apart. Look for the
# pattern "MW =" and read the mean value's digits that follow it.
def extractMWs(pdflines):
    mws = []
    for i, line in enumerate(pdflines):
        if line == 'M':
            if (pdflines[i+3] == 'W' and
                    pdflines[i+6] == ' ' and
                    pdflines[i+9] == '='):
                try:
                    # Integer digit and fractional digit are separate chunks.
                    mws.append(float(pdflines[i+12]) + 0.1*float(pdflines[i+18]))
                except ValueError, e:
                    print "Could not convert to float: %s%s" % (pdflines[i+12], pdflines[i+18])
                    mws.append(0.0)
    return mws
# Main loop: for every semester page, download each evaluation PDF,
# extract its mean values and keep the one we are interested in.
zipbag = []
for url in urls:
    mainpage = geturl(url, username, password)
    myparser = MyParser()
    myparser.parse(mainpage)
    pdfs = myparser.get_pdfs()
    titles = myparser.get_titles()
    for i, pdf in enumerate(pdfs):
        pdfsource = geturl(pdf, username, password)
        # Write the PDF in binary mode, otherwise it may get corrupted.
        fp = open(tmpfile, 'wb')
        fp.write(pdfsource)
        fp.close()
        fp = open(tmpfile, 'rb')
        mws = extractMWs(parsepdf(fp))
        fp.close()
        try:
            mw = mws[mwOfInterest]
        except IndexError, e:
            print "Could not get MW... taking 0.0 instead"
            mw = 0.0
        zipbag.append((titles[i], mw))
for tpl in sorted(zipbag, key=lambda x: x[1]):
    print "%s: %f" % tpl