# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Composable, CompositionError
from whoosh.analysis.tokenizers import Tokenizer
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.analysis.tokenizers import RegexTokenizer
from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer
from whoosh.lang.porter import stem


# Analyzers

class Analyzer(Composable):
    """ Abstract base class for analyzers.
    """

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        pass


class CompositeAnalyzer(Analyzer):
    def __init__(self, *composables):
        self.items = []

        for comp in composables:
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

        # A tokenizer must start the chain, and only filters may follow
        # (because tokenizers take a string and return a generator of tokens,
        # while filters take and return generators of tokens)
        for item in self.items[1:]:
            if isinstance(item, Tokenizer):
                raise CompositionError("Only one tokenizer allowed at the start"
                                       " of the analyzer: %r" % self.items)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__,
                           ", ".join(repr(item) for item in self.items))

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with the tokenizer
        gen = items[0](value, **kwargs)
        # Run the filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.items == other.items)

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)
# Functions that return composed analyzers

def IDAnalyzer(lowercase=False):
    """Deprecated; just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated; just use a RegexTokenizer directly.
    """

    return RegexTokenizer(expression=expression, gaps=gaps)


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()


def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain


def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)


def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                  maxsize=None, gaps=True, splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "get", "int", "get", "real"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
                              mergewords=mergewords, mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
            )
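
# A sketch of typical usage (_demo_schema and its field names are
# hypothetical, not part of the whoosh API): a composed analyzer is normally
# attached to a schema field through the TEXT field type's analyzer keyword
# rather than being called directly on text.

def _demo_schema():
    from whoosh.fields import Schema, TEXT

    return Schema(title=TEXT(analyzer=SimpleAnalyzer()),
                  body=TEXT(analyzer=StemmingAnalyzer(cachesize=-1)))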
def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
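
if __name__ == "__main__":
    # Smoke-test sketch: "en" has both a stop word list and a stemmer
    # available, so the printed tokens are lowercased, stop-filtered, and
    # stemmed. For a language missing either resource, LanguageAnalyzer
    # silently falls back to the shorter chain built above.
    ana = LanguageAnalyzer("en")
    print([t.text for t in ana("Stemming reduces words to their roots")])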