# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.automata.fsa import ANY, EPSILON, NFA # Constants for glob _LIT = 0 _STAR = 1 _PLUS = 2 _QUEST = 3 _RANGE = 4 def parse_glob(pattern, _glob_multi="*", _glob_single="?", _glob_range1="[", _glob_range2="]"): pos = 0 last = None while pos < len(pattern): char = pattern[pos] pos += 1 if char == _glob_multi: # * # (Ignore more than one star in a row) if last is not _STAR: yield _STAR, None last = _STAR elif char == _glob_single: # ? # (Ignore ? after a star) if last is not _STAR: yield _QUEST, None last = _QUEST elif char == _glob_range1: # [ chars = set() negate = False # Take the char range specification until the ] while pos < len(pattern): char = pattern[pos] pos += 1 if char == _glob_range2: break chars.add(char) if chars: yield _RANGE, (chars, negate) last = _RANGE else: yield _LIT, char last = _LIT def glob_automaton(pattern): nfa = NFA(0) i = -1 for i, (op, arg) in enumerate(parse_glob(pattern)): if op is _LIT: nfa.add_transition(i, arg, i + 1) elif op is _STAR: nfa.add_transition(i, ANY, i + 1) nfa.add_transition(i, EPSILON, i + 1) nfa.add_transition(i + 1, EPSILON, i) elif op is _QUEST: nfa.add_transition(i, ANY, i + 1) elif op is _RANGE: for char in arg[0]: nfa.add_transition(i, char, i + 1) nfa.add_final_state(i + 1) return nfa