What do you get if you merge this and this?
You get a scary little filter that extracts the top 5 keywords from a post's content and adds them to the title.
Useful titles for all posts: how about that?
#!/usr/bin/python
# Copyright (C) 2007 Enrico Zini <enrico@debian.org>
# This software is licensed under the terms of the GNU General Public
# License, version 2 or later.
"""Add the top 5 keywords of every feed item to that item's title.

The feed is parsed from standard input ("-"), every element matched by
/rdf:RDF/rss:item gets its title rewritten from "Name: rest" to
"Name [kw1, kw2, ...]: rest", and the result is serialized to standard
output ("-").
"""

import math
import re

RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
ITEM_XPATH = "/rdf:RDF/rss:item"

# Tokens are the runs of word characters between non-word separators.
tokenize = re.compile(r"\W+")

# Per-item token counts, keyed by the item's rdf:about attribute value.
# Tokens keep their original case so they can be shown as found in the text.
doc_tokens = {}

# Token counts summed over all items, keyed by the lower-cased token.
aggregated = {}


def count_tokens(text, counts):
    """Add the occurrences of every token in *text* to *counts*.

    Empty strings produced by the split (when *text* starts or ends with a
    separator) are not tokens and are skipped.  Returns *counts*.
    """
    for tok in tokenize.split(text):
        if not tok:
            continue
        counts[tok] = counts.get(tok, 0) + 1
    return counts


def aggregate_counts(per_doc, totals):
    """Sum the per-item counts of *per_doc* into *totals*, case-insensitively.

    Returns *totals*.
    """
    for counts in per_doc.values():
        for tok, count in counts.items():
            key = tok.lower()
            totals[key] = totals.get(key, 0) + count
    return totals


def tfidf(doc, tok):
    """Compute the TFIDF-style score of a token in a document.

    *doc* is a key of ``doc_tokens``; *tok* is the token as it appears in the
    document.  The term frequency is looked up with the token's original
    case (that is how ``doc_tokens`` is keyed) and the corpus weight with the
    lower-cased token (that is how ``aggregated`` is keyed).

    NOTE: the denominator is the number of occurrences of the token over all
    items, not the number of items containing it, so the logarithm can be
    negative for tokens that occur more often than there are items.

    Returns 0.0 for a token that is absent from the document or from the
    aggregate, instead of dividing by zero.
    """
    tf = doc_tokens[doc].get(tok, 0)
    total = aggregated.get(tok.lower(), 0)
    if not tf or not total:
        return 0.0
    return tf * math.log(float(len(doc_tokens)) / total)


def top5(doc):
    """Return up to 5 tokens of document *doc*, highest score first."""
    return sorted(doc_tokens[doc],
                  key=lambda tok: tfidf(doc, tok),
                  reverse=True)[:5]


def add_summary(title, summary):
    """Insert " [summary]" before the first colon of *title*.

    A title without a colon gets the bracketed summary appended instead of
    raising an unpacking error.
    """
    name, sep, content = title.partition(":")
    return name + " [" + summary + "]" + sep + content


def _nodes(first):
    """Return *first* for iteration, or an empty tuple when it is None.

    The ``children`` attribute is None for an element without children, which
    cannot be iterated.
    """
    if first is None:
        return ()
    return first


def main():
    """Read the feed from stdin, rewrite the item titles, write it to stdout."""
    # Imported here so the scoring helpers above stay importable on a system
    # without the libxml2 bindings.
    import libxml2

    doc = libxml2.parseFile("-")
    root = doc.getRootElement()

    # Create an xpath context and register the namespaces; the default
    # (unnamed) namespace is made addressable under the "rss" prefix.
    xpc = doc.xpathNewContext()
    for d in root.nsDefs():
        if d.name is None:
            xpc.xpathRegisterNs("rss", d.content)
        else:
            xpc.xpathRegisterNs(d.name, d.content)

    items = xpc.xpathEval(ITEM_XPATH)

    # Collect text stats to generate summaries
    for x in items:
        res = x.nsProp("about", RDF_NS)
        tokens = {}
        for c in _nodes(x.children):
            if c.type != 'element' or c.name not in ('encoded', 'title'):
                continue
            for t in _nodes(c.children):
                if t.type != 'text':
                    continue
                count_tokens(t.content, tokens)
        doc_tokens[res] = tokens

    # Aggregate token counts
    aggregate_counts(doc_tokens, aggregated)

    # Rewrite the titles
    for x in items:
        res = x.nsProp("about", RDF_NS)
        for c in _nodes(x.children):
            if c.type != 'element' or c.name != 'title':
                continue
            for t in _nodes(c.children):
                if t.type != 'text':
                    continue
                summary = ", ".join(top5(res))
                t.replaceNode(doc.newDocText(add_summary(t.content, summary)))
                # The old text node is no longer part of the tree: stop
                # here rather than keep walking from a detached node.
                break

    # Serialize the result.  saveFormatFile writes the document itself; its
    # return value is not printed, as that would add a stray number to the
    # output.
    doc.saveFormatFile("-", True)


if __name__ == "__main__":
    main()