[Checkins] SVN: topia.postag/trunk/ * Make filter customization easier. Provided permissiveFilter.

Stephan Richter srichter at gmail.com
Sat May 30 11:16:11 EDT 2009


Log message for revision 100552:
  * Make filter customization easier. Provided permissiveFilter.
  
  * Add another rule that looks for the verb after a modal verb, 
    eliminating several false noun detections.
  
  * Ensured a few more test cases are working correctly.
  
  * Added documentation for keyword extraction.
  

Changed:
  _U  topia.postag/trunk/
  _U  topia.postag/trunk/src/
  U   topia.postag/trunk/src/topia/postag/README.txt
  U   topia.postag/trunk/src/topia/postag/data/english-lexicon.txt
  U   topia.postag/trunk/src/topia/postag/example.txt
  U   topia.postag/trunk/src/topia/postag/extract.py
  U   topia.postag/trunk/src/topia/postag/tag.py
  U   topia.postag/trunk/src/topia/postag/tests.py

-=-

Property changes on: topia.postag/trunk
___________________________________________________________________
Added: svn:ignore
   + .installed.cfg
bin
develop-eggs
dist
parts



Property changes on: topia.postag/trunk/src
___________________________________________________________________
Added: svn:ignore
   + topia.postag.egg-info


Modified: topia.postag/trunk/src/topia/postag/README.txt
===================================================================
--- topia.postag/trunk/src/topia/postag/README.txt	2009-05-30 15:13:25 UTC (rev 100551)
+++ topia.postag/trunk/src/topia/postag/README.txt	2009-05-30 15:16:11 UTC (rev 100552)
@@ -128,9 +128,31 @@
    ['examples', 'NNS', 'example'],
    ['.', '.', '.']]
 
-So far so good. Let's now test the phase 2 rules.
+So far so good. Let's test a few more cases:
 
+  >>> tagger("The fox's tail is red.")
+  [['The', 'DT', 'The'],
+   ['fox', 'NN', 'fox'],
+   ["'s", 'POS', "'s"],
+   ['tail', 'NN', 'tail'],
+   ['is', 'VBZ', 'is'],
+   ['red', 'JJ', 'red'],
+   ['.', '.', '.']]
 
+  >>> tagger("The fox can't really jump over the fox's tail.")
+  [['The', 'DT', 'The'],
+   ['fox', 'NN', 'fox'],
+   ['can', 'MD', 'can'],
+   ["'t", 'RB', "'t"],
+   ['really', 'RB', 'really'],
+   ['jump', 'VB', 'jump'],
+   ['over', 'IN', 'over'],
+   ['the', 'DT', 'the'],
+   ['fox', 'NN', 'fox'],
+   ["'s", 'POS', "'s"],
+   ['tail', 'NN', 'tail'],
+   ['.', '.', '.']]
+
 Rules
 ~~~~~
 
@@ -165,3 +187,59 @@
     [['men', 'NNS', 'men']]
     >>> tagger('feet')
     [['feet', 'NNS', 'feet']]
+
+
+Keyword Extraction
+------------------
+
+Now that we can tag a text, let's have a look at the keyword extractions.
+
+  >>> from topia.postag import extract
+  >>> extractor = extract.KeywordExtractor()
+  >>> extractor
+  <KeywordExtractor using <Tagger for english>>
+
+As you can see, the extractor maintains a tagger:
+
+  >>> extractor.tagger
+  <Tagger for english>
+
+When creating an extractor, you can also pass in a tagger to avoid frequent
+tagger initialization:
+
+  >>> extractor = extract.KeywordExtractor(tagger)
+  >>> extractor.tagger is tagger
+  True
+
+Let's get the keywords for a simple text.
+
+  >>> extractor("The fox can't jump over the fox's tail.")
+  []
+
+We got no keywords. That's because by default at least 3 occurrences of a
+keyword must be detected, if the keyword consists of a single word.
+
+The extractor maintains a filter component. Let's register the trivial
+permissive filter, which simply returns everything that the extractor suggests:
+
+  >>> extractor.filter = extract.permissiveFilter
+  >>> extractor("The fox can't jump over the fox's tail.")
+  [('tail', 1, 1), ('fox', 2, 1)]
+
+But let's look at the default filter again, since it allows tweaking its
+parameters:
+
+  >>> extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
+  >>> extractor("The fox can't jump over the fox's tail.")
+  [('fox', 2, 1)]
+
+Let's now have a look at multi-word keywords. Oftentimes multi-word nouns and
+proper names occur only once or twice in a text. But they are often great
+keywords! To handle this scenario, the concept of "strength" was
+introduced. Currently the strength is simply the number of words in the
+keyword/term. By default, all keywords with a strength larger than 1 are
+selected regardless of the number of occurrences.
+
+  >>> extractor('The German consul of Boston resides in Newton.')
+  [('German consul', 1, 2)]
+

Modified: topia.postag/trunk/src/topia/postag/data/english-lexicon.txt
===================================================================
--- topia.postag/trunk/src/topia/postag/data/english-lexicon.txt	2009-05-30 15:13:25 UTC (rev 100551)
+++ topia.postag/trunk/src/topia/postag/data/english-lexicon.txt	2009-05-30 15:16:11 UTC (rev 100552)
@@ -17,6 +17,7 @@
 } )
 # #
 $ $
+'t RB
 Prizm NNP
 shakeup NN
 Laurance NNP

Modified: topia.postag/trunk/src/topia/postag/example.txt
===================================================================
--- topia.postag/trunk/src/topia/postag/example.txt	2009-05-30 15:13:25 UTC (rev 100551)
+++ topia.postag/trunk/src/topia/postag/example.txt	2009-05-30 15:16:11 UTC (rev 100552)
@@ -362,263 +362,263 @@
 
 Let's look at the result of the tagger first:
 
-  >>> extractor.tagger(text)
-  [['police', 'NN', 'police'],
-   ['shut', 'VBN', 'shut'],
-   ['Palestinian', 'JJ', 'Palestinian'],
-   ['theatre', 'NN', 'theatre'],
-   ['in', 'IN', 'in'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['.', '.', '.'],
-   ['Israeli', 'JJ', 'Israeli'],
-   ['police', 'NN', 'police'],
-   ['have', 'VBP', 'have'],
-   ['shut', 'VBN', 'shut'],
-   ['down', 'RB', 'down'],
-   ['a', 'DT', 'a'],
-   ['Palestinian', 'JJ', 'Palestinian'],
-   ['theatre', 'NN', 'theatre'],
-   ['in', 'IN', 'in'],
-   ['East', 'NNP', 'East'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['.', '.', '.'],
-   ['The', 'DT', 'The'],
-   ['action', 'NN', 'action'],
-   [',', ',', ','],
-   ['on', 'IN', 'on'],
-   ['Thursday', 'NNP', 'Thursday'],
-   [',', ',', ','],
-   ['prevented', 'VBN', 'prevented'],
-   ['the', 'DT', 'the'],
-   ['closing', 'VBG', 'closing'],
-   ['event', 'NN', 'event'],
-   ['of', 'IN', 'of'],
-   ['an', 'DT', 'an'],
-   ['international', 'JJ', 'international'],
-   ['literature', 'NN', 'literature'],
-   ['festival', 'NN', 'festival'],
-   ['from', 'IN', 'from'],
-   ['taking', 'VBG', 'taking'],
-   ['place', 'NN', 'place'],
-   ['.', '.', '.'],
-   ['police', 'NN', 'police'],
-   ['said', 'VBD', 'said'],
-   ['they', 'PRP', 'they'],
-   ['were', 'VBD', 'were'],
-   ['acting', 'VBG', 'acting'],
-   ['on', 'IN', 'on'],
-   ['a', 'DT', 'a'],
-   ['court', 'NN', 'court'],
-   ['order', 'NN', 'order'],
-   [',', ',', ','],
-   ['issued', 'VBN', 'issued'],
-   ['after', 'IN', 'after'],
-   ['intelligence', 'NN', 'intelligence'],
-   ['indicated', 'VBD', 'indicated'],
-   ['that', 'IN', 'that'],
-   ['the', 'DT', 'the'],
-   ['Palestinian', 'JJ', 'Palestinian'],
-   ['Authority', 'NNP', 'Authority'],
-   ['was', 'VBD', 'was'],
-   ['involved', 'VBN', 'involved'],
-   ['in', 'IN', 'in'],
-   ['the', 'DT', 'the'],
-   ['event', 'NN', 'event'],
-   ['.', '.', '.'],
-   ['Israel', 'NNP', 'Israel'],
-   ['has', 'VBZ', 'has'],
-   ['occupied', 'VBN', 'occupied'],
-   ['East', 'NNP', 'East'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['since', 'IN', 'since'],
-   ['1967', 'NN', '1967'],
-   ['and', 'CC', 'and'],
-   ['has', 'VBZ', 'has'],
-   ['annexed', 'VBD', 'annexed'],
-   ['the', 'DT', 'the'],
-   ['area', 'NN', 'area'],
-   ['.', '.', '.'],
-   ['This', 'DT', 'This'],
-   ['is', 'VBZ', 'is'],
-   ['not', 'RB', 'not'],
-   ['recognised', 'VBD', 'recognised'],
-   ['by', 'IN', 'by'],
-   ['the', 'DT', 'the'],
-   ['international', 'JJ', 'international'],
-   ['community', 'NN', 'community'],
-   ['.', '.', '.'],
-   ['The', 'DT', 'The'],
-   ['British', 'JJ', 'British'],
-   ['consul-general', 'NN', 'consul-general'],
-   ['in', 'IN', 'in'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   [',', ',', ','],
-   ['Richard', 'NNP', 'Richard'],
-   ['Makepeace', 'NNP', 'Makepeace'],
-   [',', ',', ','],
-   ['was', 'VBD', 'was'],
-   ['attending', 'VBG', 'attending'],
-   ['the', 'DT', 'the'],
-   ['event', 'NN', 'event'],
-   ['.', '.', '.'],
-   ['"', '"', '"'],
-   ['I', 'PRP', 'I'],
-   ['think', 'VBP', 'think'],
-   ['all', 'DT', 'all'],
-   ['lovers', 'NNS', 'lover'],
-   ['of', 'IN', 'of'],
-   ['literature', 'NN', 'literature'],
-   ['would', 'MD', 'would'],
-   ['regard', 'NN', 'regard'],
-   ['this', 'DT', 'this'],
-   ['as', 'IN', 'as'],
-   ['a', 'DT', 'a'],
-   ['very', 'RB', 'very'],
-   ['regrettable', 'JJ', 'regrettable'],
-   ['moment', 'NN', 'moment'],
-   ['and', 'CC', 'and'],
-   ['regrettable', 'JJ', 'regrettable'],
-   ['decision', 'NN', 'decision'],
-   [',"', ',', ',"'],
-   ['he', 'PRP', 'he'],
-   ['added', 'VBD', 'added'],
-   ['.', '.', '.'],
-   ['Mr', 'NNP', 'Mr'],
-   ['Makepeace', 'NNP', 'Makepeace'],
-   ['said', 'VBD', 'said'],
-   ['the', 'DT', 'the'],
-   ['festival', 'NN', 'festival'],
-   ["'s", 'POS', "'s"],
-   ['closing', 'VBG', 'closing'],
-   ['event', 'NN', 'event'],
-   ['would', 'MD', 'would'],
-   ['be', 'VB', 'be'],
-   ['reorganised', 'NN', 'reorganised'],
-   ['to', 'TO', 'to'],
-   ['take', 'VB', 'take'],
-   ['place', 'NN', 'place'],
-   ['at', 'IN', 'at'],
-   ['the', 'DT', 'the'],
-   ['British', 'JJ', 'British'],
-   ['Council', 'NNP', 'Council'],
-   ['in', 'IN', 'in'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['.', '.', '.'],
-   ['The', 'DT', 'The'],
-   ['Israeli', 'JJ', 'Israeli'],
-   ['authorities', 'NNS', 'authority'],
-   ['often', 'RB', 'often'],
-   ['take', 'VB', 'take'],
-   ['action', 'NN', 'action'],
-   ['against', 'IN', 'against'],
-   ['events', 'NNS', 'event'],
-   ['in', 'IN', 'in'],
-   ['East', 'NNP', 'East'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['they', 'PRP', 'they'],
-   ['see', 'VB', 'see'],
-   ['as', 'IN', 'as'],
-   ['connected', 'VBN', 'connected'],
-   ['to', 'TO', 'to'],
-   ['the', 'DT', 'the'],
-   ['Palestinian', 'JJ', 'Palestinian'],
-   ['Authority', 'NNP', 'Authority'],
-   ['.', '.', '.'],
-   ['Saturday', 'NNP', 'Saturday'],
-   ["'s", 'POS', "'s"],
-   ['opening', 'NN', 'opening'],
-   ['event', 'NN', 'event'],
-   ['at', 'IN', 'at'],
-   ['the', 'DT', 'the'],
-   ['same', 'JJ', 'same'],
-   ['theatre', 'NN', 'theatre'],
-   ['was', 'VBD', 'was'],
-   ['also', 'RB', 'also'],
-   ['shut', 'VBN', 'shut'],
-   ['down', 'RB', 'down'],
-   ['.', '.', '.'],
-   ['A', 'DT', 'A'],
-   ['police', 'NN', 'police'],
-   ['notice', 'NN', 'notice'],
-   ['said', 'VBD', 'said'],
-   ['the', 'DT', 'the'],
-   ['closure', 'NN', 'closure'],
-   ['was', 'VBD', 'was'],
-   ['on', 'IN', 'on'],
-   ['the', 'DT', 'the'],
-   ['orders', 'NNS', 'order'],
-   ['of', 'IN', 'of'],
-   ['Israel', 'NNP', 'Israel'],
-   ["'s", 'POS', "'s"],
-   ['internal', 'JJ', 'internal'],
-   ['security', 'NN', 'security'],
-   ['minister', 'NN', 'minister'],
-   ['on', 'IN', 'on'],
-   ['the', 'DT', 'the'],
-   ['grounds', 'NNS', 'ground'],
-   ['of', 'IN', 'of'],
-   ['a', 'DT', 'a'],
-   ['breach', 'NN', 'breach'],
-   ['of', 'IN', 'of'],
-   ['interim', 'JJ', 'interim'],
-   ['peace', 'NN', 'peace'],
-   ['accords', 'NNS', 'accord'],
-   ['from', 'IN', 'from'],
-   ['the', 'DT', 'the'],
-   ['1990', 'NN', '1990'],
-   ['s', 'PRP', 's'],
-   ['.', '.', '.'],
-   ['These', 'DT', 'These'],
-   ['laid', 'VBN', 'laid'],
-   ['the', 'DT', 'the'],
-   ['framework', 'NN', 'framework'],
-   ['for', 'IN', 'for'],
-   ['talks', 'NNS', 'talk'],
-   ['on', 'IN', 'on'],
-   ['establishing', 'VBG', 'establishing'],
-   ['a', 'DT', 'a'],
-   ['Palestinian', 'JJ', 'Palestinian'],
-   ['state', 'NN', 'state'],
-   ['alongside', 'IN', 'alongside'],
-   ['Israel', 'NNP', 'Israel'],
-   [',', ',', ','],
-   ['but', 'CC', 'but'],
-   ['left', 'VBN', 'left'],
-   ['the', 'DT', 'the'],
-   ['status', 'NN', 'status'],
-   ['of', 'IN', 'of'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['to', 'TO', 'to'],
-   ['be', 'VB', 'be'],
-   ['determined', 'VBN', 'determined'],
-   ['by', 'IN', 'by'],
-   ['further', 'JJ', 'further'],
-   ['negotiation', 'NN', 'negotiation'],
-   ['.', '.', '.'],
-   ['Israel', 'NNP', 'Israel'],
-   ['has', 'VBZ', 'has'],
-   ['annexed', 'VBD', 'annexed'],
-   ['East', 'NNP', 'East'],
-   ['Jerusalem', 'NNP', 'Jerusalem'],
-   ['and', 'CC', 'and'],
-   ['declares', 'VBZ', 'declares'],
-   ['it', 'PRP', 'it'],
-   ['part', 'NN', 'part'],
-   ['of', 'IN', 'of'],
-   ['its', 'PRP$', 'its'],
-   ['eternal', 'JJ', 'eternal'],
-   ['capital', 'NN', 'capital'],
-   ['.', '.', '.'],
-   ['Palestinians', 'NNPS', 'Palestinian'],
-   ['hope', 'NN', 'hope'],
-   ['to', 'TO', 'to'],
-   ['establish', 'VB', 'establish'],
-   ['their', 'PRP$', 'their'],
-   ['capital', 'NN', 'capital'],
-   ['in', 'IN', 'in'],
-   ['the', 'DT', 'the'],
-   ['area', 'NN', 'area'],
-   ['.', '.', '.']]
+  >>> printTaggedTerms(extractor.tagger(text)) #doctest: +REPORT_NDIFF
+  police          NN    police
+  shut            VBN   shut
+  Palestinian     JJ    Palestinian
+  theatre         NN    theatre
+  in              IN    in
+  Jerusalem       NNP   Jerusalem
+  .               .     .
+  Israeli         JJ    Israeli
+  police          NN    police
+  have            VBP   have
+  shut            VBN   shut
+  down            RB    down
+  a               DT    a
+  Palestinian     JJ    Palestinian
+  theatre         NN    theatre
+  in              IN    in
+  East            NNP   East
+  Jerusalem       NNP   Jerusalem
+  .               .     .
+  The             DT    The
+  action          NN    action
+  ,               ,     ,
+  on              IN    on
+  Thursday        NNP   Thursday
+  ,               ,     ,
+  prevented       VBN   prevented
+  the             DT    the
+  closing         VBG   closing
+  event           NN    event
+  of              IN    of
+  an              DT    an
+  international   JJ    international
+  literature      NN    literature
+  festival        NN    festival
+  from            IN    from
+  taking          VBG   taking
+  place           NN    place
+  .               .     .
+  police          NN    police
+  said            VBD   said
+  they            PRP   they
+  were            VBD   were
+  acting          VBG   acting
+  on              IN    on
+  a               DT    a
+  court           NN    court
+  order           NN    order
+  ,               ,     ,
+  issued          VBN   issued
+  after           IN    after
+  intelligence    NN    intelligence
+  indicated       VBD   indicated
+  that            IN    that
+  the             DT    the
+  Palestinian     JJ    Palestinian
+  Authority       NNP   Authority
+  was             VBD   was
+  involved        VBN   involved
+  in              IN    in
+  the             DT    the
+  event           NN    event
+  .               .     .
+  Israel          NNP   Israel
+  has             VBZ   has
+  occupied        VBN   occupied
+  East            NNP   East
+  Jerusalem       NNP   Jerusalem
+  since           IN    since
+  1967            NN    1967
+  and             CC    and
+  has             VBZ   has
+  annexed         VBD   annexed
+  the             DT    the
+  area            NN    area
+  .               .     .
+  This            DT    This
+  is              VBZ   is
+  not             RB    not
+  recognised      VBD   recognised
+  by              IN    by
+  the             DT    the
+  international   JJ    international
+  community       NN    community
+  .               .     .
+  The             DT    The
+  British         JJ    British
+  consul-general  NN    consul-general
+  in              IN    in
+  Jerusalem       NNP   Jerusalem
+  ,               ,     ,
+  Richard         NNP   Richard
+  Makepeace       NNP   Makepeace
+  ,               ,     ,
+  was             VBD   was
+  attending       VBG   attending
+  the             DT    the
+  event           NN    event
+  .               .     .
+  "               "     "
+  I               PRP   I
+  think           VBP   think
+  all             DT    all
+  lovers          NNS   lover
+  of              IN    of
+  literature      NN    literature
+  would           MD    would
+  regard          VB    regard
+  this            DT    this
+  as              IN    as
+  a               DT    a
+  very            RB    very
+  regrettable     JJ    regrettable
+  moment          NN    moment
+  and             CC    and
+  regrettable     JJ    regrettable
+  decision        NN    decision
+  ,"              ,     ,"
+  he              PRP   he
+  added           VBD   added
+  .               .     .
+  Mr              NNP   Mr
+  Makepeace       NNP   Makepeace
+  said            VBD   said
+  the             DT    the
+  festival        NN    festival
+  's              POS   's
+  closing         VBG   closing
+  event           NN    event
+  would           MD    would
+  be              VB    be
+  reorganised     NN    reorganised
+  to              TO    to
+  take            VB    take
+  place           NN    place
+  at              IN    at
+  the             DT    the
+  British         JJ    British
+  Council         NNP   Council
+  in              IN    in
+  Jerusalem       NNP   Jerusalem
+  .               .     .
+  The             DT    The
+  Israeli         JJ    Israeli
+  authorities     NNS   authority
+  often           RB    often
+  take            VB    take
+  action          NN    action
+  against         IN    against
+  events          NNS   event
+  in              IN    in
+  East            NNP   East
+  Jerusalem       NNP   Jerusalem
+  they            PRP   they
+  see             VB    see
+  as              IN    as
+  connected       VBN   connected
+  to              TO    to
+  the             DT    the
+  Palestinian     JJ    Palestinian
+  Authority       NNP   Authority
+  .               .     .
+  Saturday        NNP   Saturday
+  's              POS   's
+  opening         NN    opening
+  event           NN    event
+  at              IN    at
+  the             DT    the
+  same            JJ    same
+  theatre         NN    theatre
+  was             VBD   was
+  also            RB    also
+  shut            VBN   shut
+  down            RB    down
+  .               .     .
+  A               DT    A
+  police          NN    police
+  notice          NN    notice
+  said            VBD   said
+  the             DT    the
+  closure         NN    closure
+  was             VBD   was
+  on              IN    on
+  the             DT    the
+  orders          NNS   order
+  of              IN    of
+  Israel          NNP   Israel
+  's              POS   's
+  internal        JJ    internal
+  security        NN    security
+  minister        NN    minister
+  on              IN    on
+  the             DT    the
+  grounds         NNS   ground
+  of              IN    of
+  a               DT    a
+  breach          NN    breach
+  of              IN    of
+  interim         JJ    interim
+  peace           NN    peace
+  accords         NNS   accord
+  from            IN    from
+  the             DT    the
+  1990            NN    1990
+  s               PRP   s
+  .               .     .
+  These           DT    These
+  laid            VBN   laid
+  the             DT    the
+  framework       NN    framework
+  for             IN    for
+  talks           NNS   talk
+  on              IN    on
+  establishing    VBG   establishing
+  a               DT    a
+  Palestinian     JJ    Palestinian
+  state           NN    state
+  alongside       IN    alongside
+  Israel          NNP   Israel
+  ,               ,     ,
+  but             CC    but
+  left            VBN   left
+  the             DT    the
+  status          NN    status
+  of              IN    of
+  Jerusalem       NNP   Jerusalem
+  to              TO    to
+  be              VB    be
+  determined      VBN   determined
+  by              IN    by
+  further         JJ    further
+  negotiation     NN    negotiation
+  .               .     .
+  Israel          NNP   Israel
+  has             VBZ   has
+  annexed         VBD   annexed
+  East            NNP   East
+  Jerusalem       NNP   Jerusalem
+  and             CC    and
+  declares        VBZ   declares
+  it              PRP   it
+  part            NN    part
+  of              IN    of
+  its             PRP$  its
+  eternal         JJ    eternal
+  capital         NN    capital
+  .               .     .
+  Palestinians    NNPS  Palestinian
+  hope            NN    hope
+  to              TO    to
+  establish       VB    establish
+  their           PRP$  their
+  capital         NN    capital
+  in              IN    in
+  the             DT    the
+  area            NN    area
+  .               .     .
 
 Let's now apply the extractor.
 

Modified: topia.postag/trunk/src/topia/postag/extract.py
===================================================================
--- topia.postag/trunk/src/topia/postag/extract.py	2009-05-30 15:13:25 UTC (rev 100551)
+++ topia.postag/trunk/src/topia/postag/extract.py	2009-05-30 15:16:11 UTC (rev 100552)
@@ -22,10 +22,19 @@
 SEARCH = 0
 NOUN = 1
 
-def defaultFilter(word, occur, strength):
-    return ((strength == 1 and occur >= 3) or
-            (strength >= 2))
+def permissiveFilter(word, occur, strength):
+    return True
 
+class DefaultFilter(object):
+
+    def __init__(self, singleStrengthMinOccur=3, noLimitStrength=2):
+        self.singleStrengthMinOccur = singleStrengthMinOccur
+        self.noLimitStrength = noLimitStrength
+
+    def __call__(self, word, occur, strength):
+        return ((strength == 1 and occur >= self.singleStrengthMinOccur) or
+                (strength >= self.noLimitStrength))
+
 def _add(term, norm, keyword, keywords):
     keyword.append((term, norm))
     keywords.setdefault(norm, 0)
@@ -34,11 +43,13 @@
 class KeywordExtractor(object):
     zope.interface.implements(interfaces.IKeywordExtractor)
 
-    def __init__(self, tagger=None, filter=defaultFilter):
+    def __init__(self, tagger=None, filter=None):
         if tagger is None:
             tagger = tag.Tagger()
             tagger.initialize()
         self.tagger = tagger
+        if filter is None:
+            filter = DefaultFilter()
         self.filter = filter
 
     def extract(self, terms):

Modified: topia.postag/trunk/src/topia/postag/tag.py
===================================================================
--- topia.postag/trunk/src/topia/postag/tag.py	2009-05-30 15:13:25 UTC (rev 100551)
+++ topia.postag/trunk/src/topia/postag/tag.py	2009-05-30 15:16:11 UTC (rev 100552)
@@ -46,6 +46,21 @@
             tagged_term[0] = tagged_term[2] = lower_term
             tagged_term[1] = lower_tag
 
+def determineVerbAfterModal(idx, tagged_term, tagged_terms, lexicon):
+    "Determine the verb after a modal verb to avoid accidental noun detection."
+    term, tag, norm = tagged_term
+    if tag != 'MD':
+        return
+    len_terms = len(tagged_terms)
+    idx += 1
+    while idx < len_terms:
+        if tagged_terms[idx][1] == 'RB':
+            idx += 1
+            continue
+        if tagged_terms[idx][1] == 'NN':
+            tagged_terms[idx][1] = 'VB'
+        break
+
 def normalizePluralForms(idx, tagged_term, tagged_terms, lexicon):
     term, tag, norm = tagged_term
     if tag in ('NNS', 'NNPS') and term == norm:
@@ -75,6 +90,7 @@
     rules = (
         correctDefaultNounTag,
         verifyProperNounAtSentenceStart,
+        determineVerbAfterModal,
         normalizePluralForms,
         )
 

Modified: topia.postag/trunk/src/topia/postag/tests.py
===================================================================
--- topia.postag/trunk/src/topia/postag/tests.py	2009-05-30 15:13:25 UTC (rev 100551)
+++ topia.postag/trunk/src/topia/postag/tests.py	2009-05-30 15:16:11 UTC (rev 100552)
@@ -20,6 +20,13 @@
 from zope.testing import doctest
 from zope.testing.doctestunit import DocFileSuite
 
+def printTaggedTerms(terms):
+    for term, tag, norm in terms:
+        print (
+            term + ' '*(16-len(term)) +
+            tag + ' '*(6-len(tag)) +
+            norm )
+
 def test_suite():
     return unittest.TestSuite((
         DocFileSuite(
@@ -28,6 +35,7 @@
             ),
         DocFileSuite(
             'example.txt',
+            globs={'printTaggedTerms': printTaggedTerms},
             optionflags=doctest.NORMALIZE_WHITESPACE|doctest.ELLIPSIS,
             ),
         ))



More information about the Checkins mailing list