#!/usr/bin/python
# Copyright (C) 2007 Enrico Zini <enrico@debian.org>
# This software is licensed under the terms of the GNU General Public
# License, version 2 or later.
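#
# Read an RSS 1.0 (RDF) feed on standard input, score each item's text with
# TF-IDF and prepend the five top-scoring tokens to every title.
# Illustrative invocation (the script and feed file names are made up):
#   python rewrite-titles.py < planet.rdf > planet-annotated.rdf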
import libxml2, re, math
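# Split text into rough word tokens on runs of non-word characters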
tokenize = re.compile(r"\W+")
doc = libxml2.parseFile("-")
root = doc.getRootElement()
# Create an xpath context and register the namespaces
xpc = doc.xpathNewContext()
for d in root.nsDefs():
    if d.name is None:
        # The default namespace has no prefix; register it as "rss" for XPath
        xpc.xpathRegisterNs("rss", d.content)
    else:
        xpc.xpathRegisterNs(d.name, d.content)
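# The XPath expressions below rely on these prefixes: the feed's default
# namespace is reachable as "rss", while declared prefixes such as "rdf"
# keep their own names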
# Collect text stats to generate summaries
doc_tokens = {}
for x in xpc.xpathEval("/rdf:RDF/rss:item"):
res = x.nsProp("about", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
tokens = {}
for c in x.children:
if c.type != 'element' or (c.name != 'encoded' and c.name != 'title'):
continue
for t in c.children:
if t.type != 'text': continue
for tok in tokenize.split(t.content):
if tok in tokens:
tokens[tok] += 1
else:
tokens[tok] = 1
doc_tokens[res] = tokens
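# doc_tokens now maps each item's rdf:about URI to that item's token counts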
# Aggregate token counts
aggregated = {}
for d in doc_tokens.itervalues():
    for tok, count in d.iteritems():
        tok = tok.lower()
        if tok in aggregated:
            aggregated[tok] += count
        else:
            aggregated[tok] = count
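# TF-IDF as used here is a simplified variant: the raw count of the token in
# the document, multiplied by log(number of documents / corpus-wide count of
# the token). A textbook IDF would divide by the number of documents that
# contain the token (its document frequency) rather than by its total count.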
def tfidf(doc, tok):
"Compute TFIDF score of a token in a document"
tok = tok.lower()
return doc_tokens[doc].get(tok, 0) * math.log(float(len(doc_tokens)) / aggregated.get(tok, 0))
def top5(doc):
    "Return the 5 top-scoring tokens of a document by TF-IDF"
    return sorted(doc_tokens[doc].keys(), key=lambda tok: tfidf(doc, tok), reverse=True)[:5]
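# The loop below assumes titles of the form "name: rest of the title"; a
# title like "joe: Weekly report" would come out roughly as
# "joe [debian, release, ...]: Weekly report" (the tokens shown here are
# purely illustrative)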
# Rewrite the titles
for x in xpc.xpathEval("/rdf:RDF/rss:item"):
res = x.nsProp("about", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
for c in x.children:
if c.type != 'element' or c.name != 'title':
continue
for t in c.children:
if t.type != 'text': continue
name, content = t.content.split(":", 1)
summary = ", ".join(top5(res))
t.replaceNode(doc.newDocText(name+" ["+summary+"]:"+content))
# Serialize the result to standard output and release libxml2 resources
doc.saveFormatFile("-", True)
xpc.xpathFreeContext()
doc.freeDoc()