# -*- coding: utf-8 -*- """ sphinx.search.zh ~~~~~~~~~~~~~~~~ Chinese search language: includes routine to split words. :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. :license: BSD, see LICENSE for details. """ import os import re from sphinx.search import SearchLanguage try: from Stemmer import Stemmer as PyStemmer PYSTEMMER = True except ImportError: from sphinx.util.stemmer import PorterStemmer PYSTEMMER = False try: import jieba JIEBA = True except ImportError: JIEBA = False english_stopwords = set(""" a and are as at be but by for if in into is it near no not of on or such that the their then there these they this to was will with """.split()) js_porter_stemmer = """ /** * Porter Stemmer */ var Stemmer = function() { var step2list = { ational: 'ate', tional: 'tion', enci: 'ence', anci: 'ance', izer: 'ize', bli: 'ble', alli: 'al', entli: 'ent', eli: 'e', ousli: 'ous', ization: 'ize', ation: 'ate', ator: 'ate', alism: 'al', iveness: 'ive', fulness: 'ful', ousness: 'ous', aliti: 'al', iviti: 'ive', biliti: 'ble', logi: 'log' }; var step3list = { icate: 'ic', ative: '', alize: 'al', iciti: 'ic', ical: 'ic', ful: '', ness: '' }; var c = "[^aeiou]"; // consonant var v = "[aeiouy]"; // vowel var C = c + "[^aeiouy]*"; // consonant sequence var V = v + "[aeiou]*"; // vowel sequence var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 var s_v = "^(" + C + ")?" + v; // vowel in stem this.stemWord = function (w) { var stem; var suffix; var firstch; var origword = w; if (w.length < 3) return w; var re; var re2; var re3; var re4; firstch = w.substr(0,1); if (firstch == "y") w = firstch.toUpperCase() + w.substr(1); // Step 1a re = /^(.+?)(ss|i)es$/; re2 = /^(.+?)([^s])s$/; if (re.test(w)) w = w.replace(re,"$1$2"); else if (re2.test(w)) w = w.replace(re2,"$1$2"); // Step 1b re = /^(.+?)eed$/; re2 = /^(.+?)(ed|ing)$/; if (re.test(w)) { var fp = re.exec(w); re = new RegExp(mgr0); if (re.test(fp[1])) { re = /.$/; w = w.replace(re,""); } } else if (re2.test(w)) { var fp = re2.exec(w); stem = fp[1]; re2 = new RegExp(s_v); if (re2.test(stem)) { w = stem; re2 = /(at|bl|iz)$/; re3 = new RegExp("([^aeiouylsz])\\\\1$"); re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); if (re2.test(w)) w = w + "e"; else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } else if (re4.test(w)) w = w + "e"; } } // Step 1c re = /^(.+?)y$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(s_v); if (re.test(stem)) w = stem + "i"; } // Step 2 re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\ ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; suffix = fp[2]; re = new RegExp(mgr0); if (re.test(stem)) w = stem + step2list[suffix]; } // Step 3 re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; suffix = fp[2]; re = new RegExp(mgr0); if (re.test(stem)) w = stem + step3list[suffix]; } // Step 4 re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\ iti|ous|ive|ize)$/; re2 = /^(.+?)(s|t)(ion)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(mgr1); if (re.test(stem)) w = stem; } else if (re2.test(w)) { var fp = re2.exec(w); stem = fp[1] + fp[2]; re2 = new RegExp(mgr1); if (re2.test(stem)) w = stem; } // Step 5 re = /^(.+?)e$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(mgr1); re2 = new RegExp(meq1); re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) w = stem; } re = /ll$/; re2 = new RegExp(mgr1); if (re.test(w) && re2.test(w)) { re = /.$/; w = w.replace(re,""); } // and turn initial Y back to y if (firstch == "y") w = firstch.toLowerCase() + w.substr(1); return w; } } """ class SearchChinese(SearchLanguage): """ Chinese search implementation """ lang = 'zh' language_name = 'Chinese' js_stemmer_code = js_porter_stemmer stopwords = english_stopwords latin1_letters = re.compile(r'\w+(?u)[\u0000-\u00ff]') def init(self, options): if JIEBA: dict_path = options.get('dict') if dict_path and os.path.isfile(dict_path): jieba.set_dictionary(dict_path) if PYSTEMMER: class Stemmer(object): def __init__(self): self.stemmer = PyStemmer('porter') def stem(self, word): return self.stemmer.stemWord(word) else: class Stemmer(PorterStemmer): """All those porter stemmer implementations look hideous; make at least the stem method nicer. """ def stem(self, word): word = word.lower() return PorterStemmer.stem(self, word, 0, len(word) - 1) self.stemmer = Stemmer() def split(self, input): chinese = [] if JIEBA: chinese = list(jieba.cut_for_search(input)) latin1 = self.latin1_letters.findall(input) return chinese + latin1 def word_filter(self, stemmed_word): return len(stemmed_word) > 1 def stem(self, word): return self.stemmer.stem(word)