"""This module contains an object that implements the Paice-Husk stemming
algorithm.

If you just want to use the standard Paice-Husk stemming rules, use the
module's ``stem()`` function::

    stemmed_word = stem(word)

If you want to use a custom rule set, read the rules into a string where the
rules are separated by newlines, and instantiate the object with the string,
then use the object's stem method to stem words::

    stemmer = PaiceHuskStemmer(my_rules_string)
    stemmed_word = stemmer.stem(word)
"""

import re
from collections import defaultdict


class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    Rules are parsed once at construction time and stored in ``self.rules``,
    a mapping from the last character of a suffix to the list of rules for
    suffixes ending in that character, in the order they were declared.
    """

    # One rule per line. Anything after the terminating "." or ">" (for
    # example a "{ ... }" comment) is ignored because only a prefix match is
    # performed.
    rule_expr = re.compile(r"""
    ^(?P<ending>\w+)     # the suffix to match, written in reverse
    (?P<intact>[*]?)     # "*" means: only apply to a not-yet-stemmed word
    (?P<num>\d+)         # number of trailing characters to remove
    (?P<append>\w*)      # characters to append after removal
    (?P<cont>[.>])       # ">" keeps stemming, "." stops
    """, re.UNICODE | re.VERBOSE)

    # The leading run of word characters is the part that gets stemmed.
    stem_expr = re.compile(r"^\w+", re.UNICODE)

    # Prefixes removed from a word before the suffix rules are applied.
    prefixes = ("kilo", "micro", "milli", "intra", "ultra", "mega", "nano",
                "pico", "pseudo")

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated by
            newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parses ``ruletable`` and adds its rules to ``self.rules``.

        Blank lines are skipped. Each remaining line must start with a rule
        of the form ``<reversed ending>[*]<count>[<append>]<.|>>``.

        :param ruletable: a string of newline-separated rules.
        :raises ValueError: if a non-blank line does not start with a valid
            rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                # ValueError is still an Exception, so callers that caught
                # the previous generic Exception keep working.
                raise ValueError("Bad rule: %r" % line)

            # Endings are written backwards in the rule table.
            ending = match.group("ending")[::-1]
            lastchar = ending[-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            rules[lastchar].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Returns the index of the first vowel in ``word``.

        The letters a, e, i, o and u are vowels; "y" also counts when it is
        not the first letter and comes before any other vowel.

        :param word: the string to inspect.
        :returns: the index of the first vowel, or -1 if the word contains
            no vowel at all.
        """
        positions = [p for p in (word.find(v) for v in "aeiou") if p > -1]
        # Guard against min() of an empty list for vowel-less words.
        vp = min(positions) if positions else -1

        yp = word.find("y")
        if yp > 0 and (vp == -1 or yp < vp):
            return yp
        return vp

    def strip_prefix(self, word):
        """Returns ``word`` with the first matching known prefix removed.

        :param word: the string to strip.
        :returns: the remainder after the prefix, or ``word`` unchanged if
            it starts with none of the known prefixes.
        """
        for prefix in self.prefixes:
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is considered; if the
        argument does not start with a word character it is returned
        unchanged.

        :param word: the string to stem.
        :returns: the stemmed string.
        """
        rules = self.rules
        match = self.stem_expr.match(word)
        if not match:
            return word

        token = match.group(0)
        # If the word *is* a prefix (e.g. "kilo"), stripping would leave
        # nothing to stem, so keep the whole word in that case.
        stem = self.strip_prefix(token) or token

        is_intact = True
        continuing = True
        while continuing:
            # If word starts with vowel, minimum stem length is 2.
            # If word starts with consonant (or has no vowel at all),
            # minimum stem length is 3.
            min_len = 2 if self.first_vowel(stem) == 0 else 3

            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                newlen = len(stem) - num + len(append)
                if newlen < min_len:
                    continue

                is_intact = False
                # Slice from the front: ``stem[:-num]`` would wrongly yield
                # an empty string for "protect" rules where num == 0.
                keep = max(len(stem) - num, 0)
                stem = stem[:keep] + append

                continuing = cont
                break

        return stem


# The default rules for the Paice-Husk stemming algorithm

defaultrules = """
ai*2. { -ia > - if intact }
a*1. { -a > - if intact }
bb1. { -bb > -b }
city3s. { -ytic > -ys }
ci2> { -ic > - }
cn1t> { -nc > -nt }
dd1. { -dd > -d }
dei3y> { -ied > -y }
deec2ss. { -ceed > -cess }
dee1. { -eed > -ee }
de2> { -ed > - }
dooh4> { -hood > - }
e1> { -e > - }
feil1v. { -lief > -liev }
fi2> { -if > - }
gni3> { -ing > - }
gai3y. { -iag > -y }
ga2> { -ag > - }
gg1. { -gg > -g }
ht*2. { -th > - if intact }
hsiug5ct. { -guish > -ct }
hsi3> { -ish > - }
i*1. { -i > - if intact }
i1y> { -i > -y }
ji1d. { -ij > -id -- see nois4j> & vis3j> }
juf1s. { -fuj > -fus }
ju1d. { -uj > -ud }
jo1d. { -oj > -od }
jeh1r. { -hej > -her }
jrev1t. { -verj > -vert }
jsim2t. { -misj > -mit }
jn1d. { -nj > -nd }
j1s. { -j > -s }
lbaifi6. { -ifiabl > - }
lbai4y. { -iabl > -y }
lba3> { -abl > - }
lbi3. { -ibl > - }
lib2l> { -bil > -bl }
lc1. { -cl > c }
lufi4y. { -iful > -y }
luf3> { -ful > - }
lu2. { -ul > - }
lai3> { -ial > - }
lau3> { -ual > - }
la2> { -al > - }
ll1. { -ll > -l }
mui3. { -ium > - }
mu*2. { -um > - if intact }
msi3> { -ism > - }
mm1. { -mm > -m }
nois4j> { -sion > -j }
noix4ct. { -xion > -ct }
noi3> { -ion > - }
nai3> { -ian > - }
na2> { -an > - }
nee0. { protect -een }
ne2> { -en > - }
nn1. { -nn > -n }
pihs4> { -ship > - }
pp1. { -pp > -p }
re2> { -er > - }
rae0. { protect -ear }
ra2. { -ar > - }
ro2> { -or > - }
ru2> { -ur > - }
rr1. { -rr > -r }
rt1> { -tr > -t }
rei3y> { -ier > -y }
sei3y> { -ies > -y }
sis2. { -sis > -s }
si2> { -is > - }
ssen4> { -ness > - }
ss0. { protect -ss }
suo3> { -ous > - }
su*2. { -us > - if intact }
s*1> { -s > - if intact }
s0. { -s > -s }
tacilp4y. { -plicat > -ply }
ta2> { -at > - }
tnem4> { -ment > - }
tne3> { -ent > - }
tna3> { -ant > - }
tpir2b. { -ript > -rib }
tpro2b. { -orpt > -orb }
tcud1. { -duct > -duc }
tpmus2. { -sumpt > -sum }
tpec2iv. { -cept > -ceiv }
tulo2v. { -olut > -olv }
tsis0. { protect -sist }
tsi3> { -ist > - }
tt1. { -tt > -t }
uqi3. { -iqu > - }
ugo1. { -ogu > -og }
vis3j> { -siv > -j }
vie0. { protect -eiv }
vi2> { -iv > - }
ylb1> { -bly > -bl }
yli3y> { -ily > -y }
ylp0. { protect -ply }
yl2> { -ly > - }
ygo1. { -ogy > -og }
yhp1. { -phy > -ph }
ymo1. { -omy > -om }
ypo1. { -opy > -op }
yti3> { -ity > - }
yte3> { -ety > - }
ytl2. { -lty > -l }
yrtsi5. { -istry > - }
yra3> { -ary > - }
yro3> { -ory > - }
yfi3. { -ify > - }
ycn2t> { -ncy > -nt }
yca3> { -acy > - }
zi2> { -iz > - }
zy1s. { -yz > -ys }
"""

# Make the standard rules available as a module-level function

stem = PaiceHuskStemmer(defaultrules).stem