''' Index the documentation and create a search page. '''
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import sys
import os
import json
import re

# file extensions to include in index
FILE_EXTS = ('.txt', '.html')

# words to not index
EXCLUDE = ('a', 'the', 'also', 'and', 'are', 'but', 'for', 'from', 'than',
           'that', 'their', 'then', 'there', 'these', 'this', 'was', 'were',
           'what', 'where', 'which', 'who', 'whose', 'with', 'to', 'of', 'is',
           'it', 'be', 'that', 'in', '-', 'can', 'you', 'as', 'by', '=', '',
           ' ', 'an', 'on', 'if', 'have', 'not', 'will', 'up', 'or', 'at',
           'each', 'we', 'so', 'its', 'any', 'all', 'has', 'new', 'used',
           'previous', 'contents', 'next', 'about', 'see', 'them', 'one',
           'event', 'object', 'data', 'objects', 'type', 'set', 'function',
           'should', 'using', 'only', 'use', '', 'nbsp', 'they')

# characters to strip before indexing
STRIP_CHARS = ''' !?.,;:`'"(){}[]&='''

# maximum number of words to show around a match for context
CONTEXT = 14

# shortest word (after stripping) that is worth indexing
_MIN_WORD_LENGTH = 3

# captures the text between the first <title> ... </title> pair on one line
_TITLE_RE = re.compile(r'<title>(.*?)</title>', re.IGNORECASE)

# anything that looks like an HTML/XML tag
_TAG_RE = re.compile(r'<[^>]*?>')


def _extract_title(content, default):
    ''' Return the text of the first <title> element, or *default* if none. '''
    match = _TITLE_RE.search(content)
    return match.group(1) if match else default


def build_index(basedir, file_exts, exclude=(), strip_chars='', context=14,
                defang=True, verbose=False, strip_basedir=False):
    ''' Build a search index.

    Walks ``basedir`` recursively, reads every file whose name ends with one
    of ``file_exts`` and records, for each indexable word, where it occurred.

    :param basedir: Directory containing files to index
    :param file_exts: File extensions to include (a string or tuple of
        strings, as accepted by ``str.endswith``)
    :param exclude: Iterable of words to exclude (compared case-insensitively)
    :param strip_chars: Characters to strip from both ends of a word before
        indexing; a word that still contains one of them is not indexed
    :param context: Total number of words of context to include; half of
        them are taken before the match and half from the match onwards
    :param defang: If True, strip out possibly-evil HTML type things
    :param verbose: If True, print out debugging output
    :param strip_basedir: If True, strip the base directory from the filenames
    :returns: dict mapping each upper-cased word to a list of
        ``(filename, title, context_string)`` tuples, one per occurrence
    :raises SystemExit: with status 2 if ``basedir`` is not a directory
    '''
    if not os.path.isdir(basedir):
        print('Invalid base directory {}'.format(basedir))
        sys.exit(2)

    # Honour the caller's arguments rather than the module-level defaults.
    # A set gives constant-time exclusion checks inside the per-word loop.
    excluded = {word.upper() for word in exclude}
    half_window = int(context) // 2

    index = {}
    for root, _dirs, files in os.walk(basedir):
        for name in files:
            if not name.endswith(file_exts):
                continue

            filename = os.path.join(root, name)
            with open(filename, 'r') as source_file:
                content = source_file.read()

            # The title must be read before the tags are removed below.
            title = _extract_title(content, name)

            if defang:
                content = _TAG_RE.sub('', content)
            words = content.split()

            if strip_basedir:
                # Strip the base directory from the filename before writing.
                filename = filename.replace(basedir, "", 1)
                if filename.startswith(('/', os.sep)):
                    filename = filename[1:]

            for i, raw_word in enumerate(words):
                word = raw_word.strip().upper().strip(strip_chars)
                if word in excluded or len(word) < _MIN_WORD_LENGTH:
                    continue
                # Punctuation left inside the word means it is not a plain
                # term (e.g. "key=value"), so it is skipped.
                if any(character in word for character in strip_chars):
                    continue

                window = words[max(0, i - half_window):
                               min(len(words), i + half_window)]
                index.setdefault(word, []).append(
                    (filename, title, ' '.join(window)))

    # print word frequency statistics
    if verbose:
        print('Frequent keys:')
        most_frequent = sorted(index.items(), key=lambda item: len(item[1]),
                               reverse=True)[:50]
        for key, occurrences in most_frequent:
            print('{} {}'.format(key, len(occurrences)))

    return index


def build_search(output, basedir='.', verbose=False, strip_basedir=False):
    ''' Build the search index and write the json output to a file.

    The index is built with the module-level ``FILE_EXTS``, ``EXCLUDE``,
    ``STRIP_CHARS`` and ``CONTEXT`` settings and written as a single
    JavaScript statement: ``var search_index = <json>;``.

    :param output: Output file to write to
    :param basedir: Directory containing files to index
    :param verbose: If True, print out debugging output
    :param strip_basedir: If True, strip the base directory from the filenames
    '''
    print("Building Search...")
    index_obj = build_index(basedir,
                            file_exts=FILE_EXTS,
                            exclude=EXCLUDE,
                            strip_chars=STRIP_CHARS,
                            context=CONTEXT,
                            defang=True,
                            verbose=verbose,
                            strip_basedir=strip_basedir)

    # Note that encoding parameter of dumps defaults to UTF-8 in Python 2.
    # However, that parameter does not exist in Python 3.
    index_string = json.dumps(index_obj, separators=(',', ':'))

    with open(output, 'w') as index_file:
        index_file.write('var search_index = {0};'.format(index_string))