diff options
author | Daniel Veillard <veillard@src.gnome.org> | 2002-10-06 21:51:18 +0000 |
---|---|---|
committer | Daniel Veillard <veillard@src.gnome.org> | 2002-10-06 21:51:18 +0000 |
commit | 141d04ba749f93490561b8253794b99490b801eb (patch) | |
tree | 7725c29fa8fcdf1a4779a947cd9b8f78e1092364 /doc | |
parent | dc6d4abae736cfdf09dfb7ee38bafe47708d328f (diff) | |
download | android_external_libxml2-141d04ba749f93490561b8253794b99490b801eb.tar.gz android_external_libxml2-141d04ba749f93490561b8253794b99490b801eb.tar.bz2 android_external_libxml2-141d04ba749f93490561b8253794b99490b801eb.zip |
added HTML page indexing Daniel
* doc/index.py: added HTML page indexing
Daniel
Diffstat (limited to 'doc')
-rwxr-xr-x | doc/index.py | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/doc/index.py b/doc/index.py index 4e32419b..2cca7c0f 100755 --- a/doc/index.py +++ b/doc/index.py @@ -42,6 +42,13 @@ import string import os # +# We are not interested in parsing errors here +# +def callback(ctx, str): + return +libxml2.registerErrorHandler(callback, None) + +# # The dictionnary of tables required and the SQL command needed # to create them # @@ -60,6 +67,19 @@ TABLES={ KEY name (name), KEY symbol (symbol), UNIQUE KEY ID (name, symbol))""", + "wordsHTML" : """CREATE TABLE wordsHTML ( + name varchar(50) NOT NULL, + resource varchar(255) NOT NULL, + section varchar(255), + id varchar(50), + relevance int, + KEY name (name), + KEY resource (resource), + UNIQUE KEY ref (name, resource))""", + "pages" : """CREATE TABLE pages ( + resource varchar(255) NOT NULL, + title varchar(255) NOT NULL, + UNIQUE KEY name (resource))""", "Queries" : """CREATE TABLE Queries ( ID int(11) NOT NULL auto_increment, Value varchar(50) NOT NULL, @@ -237,6 +257,74 @@ def addType(name, module, desc = ""): def addFunctype(name, module, desc = ""): return updateSymbol(name, module, 'functype', desc) +def addPage(resource, title): + global DB + + if DB == None: + openMySQL() + if DB == None: + return -1 + if resource == None: + return -1 + + c = DB.cursor() + try: + ret = c.execute( + """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" % + (resource, title)) + except: + try: + ret = c.execute( + """UPDATE pages SET title='%s' WHERE resource='%s'""" % + (title, resource)) + except: + print "Update symbol (%s, %s, %s) failed command" % (name, module, type) + print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource) + print sys.exc_type, sys.exc_value + return -1 + + return ret + +def updateWordHTML(name, resource, desc, id, relevance): + global DB + + if DB == None: + openMySQL() + if DB == None: + return -1 + if name == None: + return -1 + if resource == None: + return -1 + if id == None: + id = "" + if desc == None: + desc = "" + else: + try: + desc = string.replace(desc, "'", " ") + desc = desc[0:99] + except: + desc = "" + + c = DB.cursor() + try: + ret = c.execute( +"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" % + (name, resource, desc, id, relevance)) + except: + try: + ret = c.execute( +"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % + (desc, id, relevance, name, resource)) + except: + print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance) + print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource) + print sys.exc_type, sys.exc_value + return -1 + + return ret + ######################################################################### # # # Word dictionnary and analysis routines # @@ -244,6 +332,7 @@ def addFunctype(name, module, desc = ""): ######################################################################### wordsDict = {} +wordsDictHTML = {} def splitIdentifier(str): ret = [] @@ -303,6 +392,65 @@ def addString(str, module, symbol, relevance): return ret +def addWordHTML(word, resource, id, section, relevance): + global wordsDictHTML + + if word == None or len(word) < 3: + return -1 + if resource == None or section == None: + return -1 + + if wordsDictHTML.has_key(word): + d = wordsDictHTML[word] + if d == None: + return 0 + if len(d) > 15: + wordsDictHTML[word] = None + return 0 + try: + (r,i,s) = d[resource] + if i != None: + id = i + if s != None: + section = s + relevance = relevance + r + except: + pass + else: + wordsDictHTML[word] = {} + wordsDictHTML[word][resource] = (relevance, id, section) + return relevance + +def addStringHTML(str, resource, id, section, relevance): + if str == None or len(str) < 3: + return -1 + ret = 0 + str = string.replace(str, ".", " ") + str = string.replace(str, ",", " ") + str = string.replace(str, "'", " ") + str = string.replace(str, '"', " ") + str = string.replace(str, ";", " ") + str = string.replace(str, "-", " ") + str = string.replace(str, "(", " ") + str = string.replace(str, ")", " ") + str = string.replace(str, "{", " ") + str = string.replace(str, "}", " ") + str = string.replace(str, "<", " ") + str = string.replace(str, ">", " ") + str = string.replace(str, "/", " ") + str = string.replace(str, "*", " ") + str = string.replace(str, ":", " ") + str = string.replace(str, "\n", " ") + str = string.replace(str, "\r", " ") + str = string.replace(str, "\xc2", " ") + str = string.replace(str, "\xa0", " ") + l = string.split(str) + for word in l: + if len(word) > 2: + ret = ret + addWordHTML(word, resource, id, section, relevance) + + return ret + ######################################################################### # # @@ -563,6 +711,83 @@ def analyzeAPI(doc): ######################################################################### # # +# Web pages parsing and analysis # +# # +######################################################################### + +import glob + +def analyzeHTMLPara(doc, resource, p, section, id): + words = 0 + try: + content = p.content + words = words + addStringHTML(content, resource, id, section, 5) + except: + return -1 + return words + +def analyzeHTMLPre(doc, resource, p, section, id): + words = 0 + try: + content = p.content + words = words + addStringHTML(content, resource, id, section, 5) + except: + return -1 + return words + +def analyzeHTML(doc, resource): + para = 0; + ctxt = doc.xpathNewContext() + try: + res = ctxt.xpathEval("//head/title") + title = res[0].content + except: + title = "Page %s" % (resource) + addPage(resource, title) + try: + items = ctxt.xpathEval("//h1 | //h2 | //h3 | //p | //pre") + section = title + id = "" + for item in items: + if item.name == 'h1' or item.name == 'h2' or item.name == 'h3': + section = item.content + if item.prop("id"): + id = item.prop("id") + elif item.prop("name"): + id = item.prop("name") + elif item.name == 'p': + analyzeHTMLPara(doc, resource, item, section, id) + para = para + 1 + elif item.name == 'pre': + analyzeHTMLPre(doc, resource, item, section, id) + para = para + 1 + else: + print "Page %s, unexpected %s element" % (resource, item.name) + except: + print "Page %s: problem analyzing" % (resource) + print sys.exc_type, sys.exc_value + + return para + +def analyzeHTMLPages(): + ret = 0 + HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html") + for html in HTMLfiles: + if html[0:3] == "API": + continue + if html == "xml.html": + continue + try: + doc = libxml2.htmlParseFile(html, None) + res = analyzeHTML(doc, html) + print "Parsed %s : %d paragraphs" % (html, res) + ret = ret + 1 + except: + print "could not parse %s" % (html) + return ret + +######################################################################### +# # # Main code: open the DB, the API XML and analyze it # # # ######################################################################### @@ -573,6 +798,23 @@ except: print sys.exc_type, sys.exc_value sys.exit(1) +ret = analyzeHTMLPages() +print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret) + +i = 0 +skipped = 0 +for word in wordsDictHTML.keys(): + refs = wordsDictHTML[word] + if refs == None: + skipped = skipped + 1 + continue; + for resource in refs.keys(): + (relevance, id, section) = refs[resource] + updateWordHTML(word, resource, section, id, relevance) + i = i + 1 + +print "Found %d associations in HTML pages" % (i) + try: doc = loadAPI(API) ret = analyzeAPI(doc) |