#!/usr/bin/env python
'''Index the documentation and create a search page

A.Mastbaum <mastbaum@hep.upenn.edu>, October 2012
'''

import sys
import os
import argparse
import json
import re
import unicodedata

# File extensions to include in the index.  Kept as a tuple because the
# indexer matches file names with str.endswith(), which accepts a tuple of
# suffixes.
FILE_EXTS = ('.txt', '.html')

# Words to leave out of the index.  The indexer upper-cases both these entries
# and the words it reads, so matching is case-insensitive.  Words shorter than
# three characters are dropped by the indexer regardless of this list.
# NOTE: 'that' and '' each appear twice; the duplicates are harmless.
EXCLUDE = ('a', 'the', 'also', 'and', 'are', 'but', 'for', 'from', 'than',
           'that', 'their', 'then', 'there', 'these', 'this', 'was', 'were',
           'what', 'where', 'which', 'who', 'whose', 'with', 'to', 'of', 'is',
           'it', 'be', 'that', 'in', '-', 'can', 'you', 'as', 'by', '=', '',
           ' ', 'an', 'on', 'if', 'have', 'not', 'will', 'up', 'or', 'at',
           'each', 'we', 'so', 'its', 'any', 'all', 'has', 'new', 'used',
           'previous', 'contents', 'next', 'about', 'see', 'them', 'one',
           'event', 'object', 'data', 'objects', 'type', 'set', 'function',
           'should', 'using', 'only', 'use', '', 'nbsp', 'they')

# Characters stripped from both ends of each word before indexing.  A word
# that still contains one of these characters after stripping is not indexed.
STRIP_CHARS = ''' !?.,;:`'"(){}[]&='''

# Maximum number of words of context stored with each match.  The indexer
# halves this value and keeps that many words before the match and that many
# starting at the match itself.
CONTEXT = 14


def build_index(basedir, file_exts, exclude=(), strip_chars='', context=14,
                defang=True, verbose=False):
    '''Build a search index.

    Walks ``basedir`` recursively, reads every file whose name ends with one
    of ``file_exts`` and records each indexable word together with the file
    it came from, the page title and a few surrounding words of context.

    A word is indexed when, after upper-casing and stripping ``strip_chars``
    from both ends, it is at least three characters long, is not in
    ``exclude`` and contains none of ``strip_chars``.

    :param basedir: Directory containing files to index
    :param file_exts: File extension(s) to include: a string or a tuple of
                      strings as accepted by ``str.endswith`` (a list or set
                      is converted to a tuple)
    :param exclude: Iterable of words to exclude (compared case-insensitively)
    :param strip_chars: Characters to strip before indexing
    :param context: Number of words of context to include; half of them
                    precede the match, the rest start at the match itself
    :param defang: If True, strip out possibly-evil HTML type things
    :param verbose: If True, print out debugging output
    :returns: dict mapping each upper-cased word to a list of
              ``(filename, title, context_string)`` tuples, one per occurrence
    :raises SystemExit: (exit status 2) if ``basedir`` is not a directory
    '''
    if not os.path.isdir(basedir):
        # Single-argument parenthesised print is valid on Python 2 and 3.
        print('Invalid base directory %s' % basedir)
        sys.exit(2)

    # str.endswith only accepts a string or a tuple of strings.
    if isinstance(file_exts, (list, set, frozenset)):
        file_exts = tuple(file_exts)

    # Honour the caller's arguments (the module-level defaults used to
    # silently override them) and use a set for O(1) membership tests.
    excluded = set(word.upper() for word in exclude)
    half = int(context) // 2

    title_re = re.compile(r'<(?:title|TITLE)>(.*?)</(?:title|TITLE)>')
    tag_re = re.compile(r'<[^>]*?>')

    index = {}
    for root, _dirs, files in os.walk(basedir):
        for name in files:
            if not name.endswith(file_exts):
                continue

            filename = os.path.join(root, name)
            # Only the read needs the file handle; close it straight away.
            with open(filename, 'r') as source_file:
                content = source_file.read()

            # The title must be extracted before the tags are removed.
            match = title_re.search(content)
            title = match.group(1) if match else name

            if defang:
                content = tag_re.sub('', content)

            words = content.split()
            for i, raw_word in enumerate(words):
                word = raw_word.upper().strip(strip_chars)
                if len(word) < 3 or word in excluded:
                    continue

                # Skip tokens with punctuation left in the middle (URLs,
                # attribute assignments, code fragments, ...).
                if any(character in word for character in strip_chars):
                    continue

                # Slicing clamps at the end of the list, so only the lower
                # bound needs guarding against negative (wrap-around) indices.
                context_string = ' '.join(words[max(0, i - half):i + half])

                index.setdefault(word, []).append((filename, title,
                                                   context_string))

    # print word frequency statistics
    if verbose:
        print('Frequent keys:')
        ranked = sorted(index.items(), key=lambda item: len(item[1]),
                        reverse=True)
        for key, entries in ranked[:50]:
            print('%s %d' % (key, len(entries)))

    return index


def build_search(output, basedir='.', verbose=False):
    '''Build the search index for a directory and write it out as JavaScript.

    The index returned by ``build_index`` (using the module-level
    ``FILE_EXTS``, ``EXCLUDE``, ``STRIP_CHARS`` and ``CONTEXT`` settings) is
    serialised as JSON and written to ``output`` as a single statement of the
    form ``var search_index = {...};``.

    :param output: Path of the JavaScript file to write
    :param basedir: Directory containing files to index
    :param verbose: If True, print out debugging output while indexing
    '''
    print("Building Search...")

    # Build the index before opening the output file, so that a failure while
    # indexing does not leave a truncated/empty output file behind.
    # ``verbose`` must be passed by keyword: the sixth positional parameter of
    # build_index is ``defang``, not ``verbose``.
    index_obj = build_index(basedir, FILE_EXTS, EXCLUDE, STRIP_CHARS, CONTEXT,
                            verbose=verbose)

    if sys.version_info[0] < 3:
        # Python 2: file contents are byte strings; tell json how to decode.
        index_string = json.dumps(index_obj, encoding='iso-8859-1')
    else:
        # Python 3's json.dumps has no ``encoding`` keyword.
        index_string = json.dumps(index_obj)

    # NOTE(review): this removes every occurrence of '/doc' in the serialised
    # index, including any that appear in titles or context text --
    # presumably it is meant to strip a path prefix from file names; confirm.
    index_string = index_string.replace('/doc', '')

    with open(output, 'w') as index_file:
        index_file.write('var search_index = %s;' % index_string)