#!/usr/bin/python
# Copyright (C) 2007 Enrico Zini <enrico@debian.org>
# This software is licensed under the terms of the GNU General Public
# License, version 2 or later.
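#
# Read an RSS 1.0 (RDF) feed on standard input, score each item's text with
# TF-IDF and prepend the five top-scoring tokens to every title.
# Illustrative invocation (the script and feed file names are made up):
#   python rewrite-titles.py < planet.rdf > planet-annotated.rdf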
import libxml2, re, math
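# Split text into rough word tokens on runs of non-word characters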
tokenize = re.compile(r"\W+")
doc = libxml2.parseFile("-")
root = doc.getRootElement()
# Create an xpath context and register the namespaces
xpc = doc.xpathNewContext()
for d in root.nsDefs():
    if d.name is None:
        # The default namespace has no prefix; register it as "rss" for XPath
        xpc.xpathRegisterNs("rss", d.content)
    else:
        xpc.xpathRegisterNs(d.name, d.content)
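# The XPath expressions below rely on these prefixes: the feed's default
# namespace is reachable as "rss", while declared prefixes such as "rdf"
# keep their own names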
# Collect text stats to generate summaries
doc_tokens = {}
for x in xpc.xpathEval("/rdf:RDF/rss:item"):
res = x.nsProp("about", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
tokens = {}
for c in x.children:
if c.type != 'element' or (c.name != 'encoded' and c.name != 'title'):
continue
for t in c.children:
if t.type != 'text': continue
for tok in tokenize.split(t.content):
if tok in tokens:
tokens[tok] += 1
else:
tokens[tok] = 1
doc_tokens[res] = tokens
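# doc_tokens now maps each item's rdf:about URI to that item's token counts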
# Aggregate token counts
aggregated = {}
for d in doc_tokens.itervalues():
    for tok, count in d.iteritems():
        tok = tok.lower()
        if tok in aggregated:
            aggregated[tok] += count
        else:
            aggregated[tok] = count
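# TF-IDF as used here is a simplified variant: the raw count of the token in
# the document, multiplied by log(number of documents / corpus-wide count of
# the token). A textbook IDF would divide by the number of documents that
# contain the token (its document frequency) rather than by its total count.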
def tfidf(doc, tok):
"Compute TFIDF score of a token in a document"
tok = tok.lower()
return doc_tokens[doc].get(tok, 0) * math.log(float(len(doc_tokens)) / aggregated.get(tok, 0))
def top5(doc):
    "Return the 5 top-scoring tokens of a document by TF-IDF"
    return sorted(doc_tokens[doc].keys(), key=lambda tok: tfidf(doc, tok), reverse=True)[:5]
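# The loop below assumes titles of the form "name: rest of the title"; a
# title like "joe: Weekly report" would come out roughly as
# "joe [debian, release, ...]: Weekly report" (the tokens shown here are
# purely illustrative)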
# Rewrite the titles
for x in xpc.xpathEval("/rdf:RDF/rss:item"):
res = x.nsProp("about", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
for c in x.children:
if c.type != 'element' or c.name != 'title':
continue
for t in c.children:
if t.type != 'text': continue
name, content = t.content.split(":", 1)
summary = ", ".join(top5(res))
t.replaceNode(doc.newDocText(name+" ["+summary+"]:"+content))
# Serialize the result to standard output and release libxml2 resources
doc.saveFormatFile("-", True)
xpc.xpathFreeContext()
doc.freeDoc()