#!/usr/bin/env python
'''Index the documentation and create a search page

A.Mastbaum, October 2012
'''
import sys
import os
import argparse
import json
import re
import unicodedata

# file extensions to include in index
FILE_EXTS = ('.txt', '.html')

# words to not index
EXCLUDE = ('a', 'the', 'also', 'and', 'are', 'but', 'for', 'from', 'than',
           'that', 'their', 'then', 'there', 'these', 'this', 'was', 'were',
           'what', 'where', 'which', 'who', 'whose', 'with', 'to', 'of',
           'is', 'it', 'be', 'that', 'in', '-', 'can', 'you', 'as', 'by',
           '=', '', ' ', 'an', 'on', 'if', 'have', 'not', 'will', 'up',
           'or', 'at', 'each', 'we', 'so', 'its', 'any', 'all', 'has',
           'new', 'used', 'previous', 'contents', 'next', 'about', 'see',
           'them', 'one', 'event', 'object', 'data', 'objects', 'type',
           'set', 'function', 'should', 'using', 'only', 'use', '', 'nbsp',
           'they')

# characters to strip before indexing
STRIP_CHARS = ''' !?.,;:`'"(){}[]&='''

# maximum number of words to show around a match for context
CONTEXT = 14


def build_index(basedir, file_exts, exclude=(), strip_chars='', context=14,
                defang=True, verbose=False):
    '''Build a search index mapping upper-cased words to their occurrences.

    :param basedir: Directory containing files to index
    :param file_exts: File extensions to include
    :param exclude: Iterable of words to exclude (matched case-insensitively)
    :param strip_chars: Characters to strip from each word before indexing
    :param context: Total number of context words to keep around a match
    :param defang: If True, strip out possibly-evil HTML type things
    :param verbose: If True, print out debugging output
    :returns: dict mapping WORD -> list of (filename, title, context) tuples
    '''
    if not os.path.isdir(basedir):
        print('Invalid base directory', basedir)
        sys.exit(2)

    # BUG FIX: the original built these from the module-level constants
    # (EXCLUDE, CONTEXT, STRIP_CHARS), silently ignoring the parameters.
    # Use a set so membership tests in the hot loop are O(1).
    exclude = {word.upper() for word in exclude}
    # half-window: `context` words total, split evenly around the match
    context = int(context / 2)

    index = {}
    for root, _dirs, files in os.walk(basedir):
        for name in (f for f in files if f.endswith(tuple(file_exts))):
            filename = os.path.join(root, name)
            with open(filename, 'r') as source_file:
                content = source_file.read()

            # BUG FIX: the original pattern r'<(title|TITLE)>.*?' matched
            # only the opening tag (the lazy .*? matched nothing), so the
            # [7:-8] slice produced garbage. Capture the title text proper.
            match = re.search(r'<title>(.*?)</title>', content,
                              re.IGNORECASE | re.DOTALL)
            title = match.group(1) if match else name

            if defang:
                # crude tag stripper; good enough for locally-built docs
                content = re.sub(r'<[^>]*?>', '', content)

            content = content.split()
            for i, word in enumerate(w.strip().upper() for w in content):
                word = word.strip(strip_chars)
                if word in exclude or len(word) < 3:
                    continue
                # skip words that still contain punctuation after stripping
                # (BUG FIX: original tested the STRIP_CHARS global here)
                if any(c in word for c in strip_chars):
                    continue
                context_string = ' '.join(
                    content[max(0, i - context):min(len(content), i + context)])
                index.setdefault(word, []).append(
                    (filename, title, context_string,))

    # print word frequency statistics
    if verbose:
        print('Frequent keys:')
        for k, v in sorted(index.items(), key=lambda x: len(x[1]),
                           reverse=True)[:50]:
            print(k, len(v))

    return index


def build_search(output, basedir='.', verbose=False):
    '''Build the search index and write it out as a JavaScript variable.

    :param output: Path of the JavaScript file to write
    :param basedir: Directory containing files to index
    :param verbose: If True, print out debugging output
    '''
    print('Building Search...')
    with open(output, 'w') as index_file:
        # BUG FIX: the original passed `verbose` positionally, where it
        # landed in build_index's `defang` slot; pass it by keyword.
        index_obj = build_index(basedir, FILE_EXTS, EXCLUDE, STRIP_CHARS,
                                CONTEXT, verbose=verbose)
        # Python 3 json.dumps takes no `encoding` argument; output is str.
        index_string = json.dumps(index_obj)
        # strip the '/doc' path prefix so links resolve from the search page
        index_string = index_string.replace('/doc', '')
        index_file.write('var search_index = %s;' % index_string)