[Zope-Checkins] SVN: Zope/trunk/src/Products/ZCTextIndex/ Treat fullwidth space characters defined in Unicode as valid whitespace.

Wed Feb 3 10:13:48 EST 2010

Log message for revision 108734:
  Treat fullwidth space characters defined in Unicode as valid whitespace.
  Patch by Manabu TERADA.
  

Changed:
  U   Zope/trunk/src/Products/ZCTextIndex/QueryParser.py
  U   Zope/trunk/src/Products/ZCTextIndex/tests/testQueryParser.py

-=-
Modified: Zope/trunk/src/Products/ZCTextIndex/QueryParser.py
===================================================================

--- Zope/trunk/src/Products/ZCTextIndex/QueryParser.py	2010-02-03 14:15:23 UTC (rev 108733)
+++ Zope/trunk/src/Products/ZCTextIndex/QueryParser.py	2010-02-03 15:13:48 UTC (rev 108734)
@@ -94,6 +94,11 @@
     )
 """, re.VERBOSE)
 
+# Use unicode regex to treat fullwidth space characters defined in Unicode
+# as valid whitespace.
+_tokenizer_unicode_regex = re.compile(
+    _tokenizer_regex.pattern, _tokenizer_regex.flags|re.UNICODE)
+
 class QueryParser:
 
     implements(IQueryParser)
@@ -109,7 +114,13 @@
 
     def parseQuery(self, query):
         # Lexical analysis.
-        tokens = _tokenizer_regex.findall(query)
+        try:
+            # Try unicode first so fullwidth spaces count as whitespace.
+            if not isinstance(query, unicode):
+                query = query.decode('utf-8')
+            tokens = _tokenizer_unicode_regex.findall(query)
+        except UnicodeDecodeError:
+            tokens = _tokenizer_regex.findall(query)
         self._tokens = tokens
         # classify tokens
         self._tokentypes = [_keywords.get(token.upper(), _ATOM)

Modified: Zope/trunk/src/Products/ZCTextIndex/tests/testQueryParser.py
===================================================================
--- Zope/trunk/src/Products/ZCTextIndex/tests/testQueryParser.py	2010-02-03 14:15:23 UTC (rev 108733)
+++ Zope/trunk/src/Products/ZCTextIndex/tests/testQueryParser.py	2010-02-03 15:13:48 UTC (rev 108734)
@@ -210,6 +210,18 @@
         self.expect("foo* bar", AndNode([GlobNode("foo*"),
                                          AtomNode("bar")]))
 
+    def test024(self):
+        # Split by UTF-8 fullwidth space
+        from Products.ZCTextIndex.ParseTree import AndNode
+        from Products.ZCTextIndex.ParseTree import AtomNode
+        self.expect("foo\xe3\x80\x80bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test025(self):
+        # Split by Unicode fullwidth space
+        from Products.ZCTextIndex.ParseTree import AndNode
+        from Products.ZCTextIndex.ParseTree import AtomNode
+        self.expect(u"foo\u3000bar", AndNode([AtomNode(u"foo"), AtomNode(u"bar")]))
+
     def test101(self):
         self.failure("")