'''
Index the documentation and create a search page.
'''

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import sys
import os
import json
import re

# File extensions to include in the index. build_search passes this to
# build_index, which matches it against file names with str.endswith, so it
# must stay a tuple.
FILE_EXTS = ('.txt', '.html')

# Words that are never indexed (stop words). build_index upper-cases both these
# entries and the candidate words before comparing, so matching is
# case-insensitive. Duplicate entries (e.g. 'that', '') are harmless.
EXCLUDE = ('a', 'the', 'also', 'and', 'are', 'but', 'for', 'from', 'than',
           'that', 'their', 'then', 'there', 'these', 'this', 'was', 'were',
           'what', 'where', 'which', 'who', 'whose', 'with', 'to', 'of', 'is',
           'it', 'be', 'that', 'in', '-', 'can', 'you', 'as', 'by', '=', '',
           ' ', 'an', 'on', 'if', 'have', 'not', 'will', 'up', 'or', 'at',
           'each', 'we', 'so', 'its', 'any', 'all', 'has', 'new', 'used',
           'previous', 'contents', 'next', 'about', 'see', 'them', 'one',
           'event', 'object', 'data', 'objects', 'type', 'set', 'function',
           'should', 'using', 'only', 'use', '', 'nbsp', 'they')

# Characters stripped from both ends of each word before indexing. A word that
# still contains one of these characters after stripping is not indexed at all.
STRIP_CHARS = ''' !?.,;:`'"(){}[]&='''

# Maximum number of words in the context snippet stored with each match,
# counting the matched word itself. build_index halves this value to get the
# window on either side of the match.
CONTEXT = 14

def build_index(basedir, file_exts, exclude=(), strip_chars='', context=14,
                defang=True, verbose=False, strip_basedir=False):
    '''
    Build a search index.

    Walks ``basedir`` recursively, reads every file whose name ends with one
    of ``file_exts`` and records, for each indexable word, where it was found.

    A word is indexed when, after upper-casing and stripping ``strip_chars``
    from both ends, it is at least three characters long, is not listed in
    ``exclude`` (compared case-insensitively) and contains none of the
    ``strip_chars`` characters.

    :param basedir: Directory containing files to index
    :param file_exts: File extensions to include (a string or a tuple of
                      strings; a list or set is converted to a tuple)
    :param exclude: Iterable of words to exclude
    :param strip_chars: Characters to strip before indexing
    :param context: Number of words of context to include; half of this
                    value is taken on each side of the match
    :param defang: If True, strip out possibly-evil HTML type things
    :param verbose: If True, print out debugging output
    :param strip_basedir: If True, strip the base directory from the filenames
    :returns: dict mapping each upper-cased word to a list of
              ``(filename, title, context_string)`` tuples, one per occurrence
    :raises SystemExit: (exit status 2) if ``basedir`` is not a directory
    '''
    if not os.path.isdir(basedir):
        print('Invalid base directory {}'.format(basedir))
        sys.exit(2)

    # str.endswith only accepts a string or a tuple of strings.
    if isinstance(file_exts, (list, set, frozenset)):
        file_exts = tuple(file_exts)

    # Honour the caller's arguments (not the module-level defaults) and use a
    # set so the per-word membership test is O(1).
    excluded = set(word.upper() for word in exclude)
    half_window = int(context) // 2

    # Compile the patterns once instead of looking them up for every file.
    title_re = re.compile(r'<(?:title|TITLE)>(.*?)</(?:title|TITLE)>')
    tag_re = re.compile(r'<[^>]*?>')

    index = {}
    for root, _dirs, files in os.walk(basedir):
        for name in files:
            if not name.endswith(file_exts):
                continue

            filename = os.path.join(root, name)
            with open(filename, 'r') as source_file:
                content = source_file.read()

            # Use the (single-line) HTML title when present, else the file name.
            match = title_re.search(content)
            title = match.group(1) if match else name

            if defang:
                content = tag_re.sub('', content)

            words = content.split()

            if strip_basedir:
                # Store the path relative to the base directory; relpath also
                # copes with platform-specific path separators.
                filename = os.path.relpath(filename, basedir)

            for i, raw_word in enumerate(words):
                word = raw_word.upper().strip(strip_chars)
                if len(word) < 3 or word in excluded:
                    continue

                # Skip tokens that still contain punctuation after stripping
                # (e.g. "foo.bar"); they are unlikely to be useful search terms.
                if any(character in word for character in strip_chars):
                    continue

                # Slicing clamps the upper bound, only the lower needs a guard.
                start = max(0, i - half_window)
                context_string = ' '.join(words[start:i + half_window])

                index.setdefault(word, []).append(
                    (filename, title, context_string))

    # print word frequency statistics
    if verbose:
        print('Frequent keys:')
        by_frequency = sorted(index.items(), key=lambda item: len(item[1]),
                              reverse=True)
        for key, entries in by_frequency[:50]:
            print('{} {}'.format(key, len(entries)))

    return index

def build_search(output, basedir='.', verbose=False, strip_basedir=False):
    '''
    Index the documentation tree and save the result as a JavaScript file.

    The index is produced by build_index using the module-level settings
    (FILE_EXTS, EXCLUDE, STRIP_CHARS, CONTEXT), serialized as compact JSON and
    written to ``output`` as a single ``var search_index = ...;`` statement.

    :param output: Output file to write to
    :param basedir: Directory containing files to index
    :param verbose: If True, print out debugging output
    :param strip_basedir: If True, strip the base directory from the filenames
    '''
    print("Building Search...")

    index_options = dict(
        file_exts=FILE_EXTS,
        exclude=EXCLUDE,
        strip_chars=STRIP_CHARS,
        context=CONTEXT,
        defang=True,
        verbose=verbose,
        strip_basedir=strip_basedir,
    )
    search_index = build_index(basedir, **index_options)

    # The encoding parameter of json.dumps defaults to UTF-8 in Python 2 and
    # does not exist in Python 3, so it is deliberately left unspecified.
    serialized = json.dumps(search_index, separators=(',', ':'))
    javascript = 'var search_index = {0};'.format(serialized)

    with open(output, 'w') as handle:
        handle.write(javascript)