[Zope-dev] ZCatalog : UTF-8 Chinese

Sin Hang Kin kentsin@poboxes.com
Mon, 25 Sep 2000 22:43:56 +0800


This is a multi-part message in MIME format.

------=_NextPart_000_0632_01C02742.16A7E2C0
Content-Type: text/plain;
	charset="Windows-1252"
Content-Transfer-Encoding: 7bit

HI,

I have a project which needs to search Chinese text. I think I can make
ZCatalog search Chinese in UTF-8, so I changed the Voodoo Kludge Splitter.py
to convert the input string to Unicode (assuming it is UTF-8) and wrote a
version of split of my own (see the attached Splitter.py). I borrowed (stole)
the UTF-8 encoding conversion scheme from Interscript. I separated the
Chinese words with spaces by hand, hoping ZCatalog would work.

After making these changes, I have a catalog which looks good: I can see
from the vocabulary that the Chinese words are actually there (except for
some which have HTML-encoded characters like < inside the UTF-8).

I generated the search interface and tested it. However, searching for the
index terms returns nothing. I searched for most of the entries found in the
vocabulary, but almost none work, and those that do work also return many
unwanted results.

What is causing this failure? What can I do to go further?


Rgs,

Kent Sin
---------------------------------
kentsin.weblogs.com
kentsin.imeme.net

------=_NextPart_000_0632_01C02742.16A7E2C0
Content-Type: application/octet-stream;
	name="Splitter.py"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
	filename="Splitter.py"

"""Doc String"""

import string

# Borrow from Interscript
#
def utf8(i):
    """Encode the integer code point i as a UTF-8 byte sequence.

    i -- non-negative integer code point.

    Returns a string holding one chr() character per encoded byte (a plain
    byte string on Python 2).  The original 1- to 6-byte UTF-8 forms are
    produced, selected by the magnitude of i; for values of 0x4000000 and
    above the 6-byte form is used and only the low 31 bits are encoded.

    Raises ValueError (from chr()) when i is negative.
    """
    if i < 0x80:
        return chr(i)  # ASCII is encoded as itself

    # Pick the form: number of continuation bytes, marker bits of the lead
    # byte, and the mask for the payload bits the lead byte carries.
    # Plain integer literals are used (no "L" suffix) so the function also
    # parses on interpreters without long-literal syntax.
    if i < 0x800:
        tail, marker, mask = 1, 0xC0, 0x1F
    elif i < 0x10000:
        tail, marker, mask = 2, 0xE0, 0x0F
    elif i < 0x200000:
        tail, marker, mask = 3, 0xF0, 0x07
    elif i < 0x4000000:
        tail, marker, mask = 4, 0xF8, 0x03
    else:
        tail, marker, mask = 5, 0xFC, 0x01

    # Lead byte carries the highest payload bits ...
    out = [chr(marker | ((i >> (6 * tail)) & mask))]
    # ... followed by one 10xxxxxx continuation byte per remaining 6 bits,
    # most significant group first.
    shift = 6 * (tail - 1)
    while shift >= 0:
        out.append(chr(0x80 | ((i >> shift) & 0x3F)))
        shift = shift - 6
    return ''.join(out)

def seq_to_utf8(a):
    """Encode a sequence of integer code points as one UTF-8 string.

    a -- iterable of integer code points; each one is passed to utf8().

    Returns the concatenation of the per-code-point encodings, or '' when
    the sequence is empty.
    """
    # Join once instead of growing the string by repeated concatenation.
    return ''.join([utf8(ch) for ch in a])

def parse_utf8(s, i):
    """Decode one UTF-8 sequence from s starting at index i.

    s -- string whose characters are the encoded bytes (ord() is applied
         to each character).
    i -- index of the lead byte of the sequence to decode.

    Returns a tuple (code_point, next_index), where next_index is the
    index just past the decoded sequence.

    When the byte at i is not a valid lead byte, or the sequence is cut
    off by the end of s, the byte itself is returned as the value and
    next_index is i + 1, so a caller looping over s always makes progress.
    Continuation bytes are not validated; only their low six bits are used.

    Raises IndexError if i itself is outside s.
    """
    lead = ord(s[i])
    if (lead & 0x80) == 0:
        return lead, i + 1  # ASCII

    # Number of continuation bytes and the payload bits held by the lead
    # byte.  The payload mask shrinks as the marker prefix grows; using a
    # wider mask would leak marker bits into the decoded value.
    if (lead & 0xE0) == 0xC0:
        extra, value = 1, lead & 0x1F
    elif (lead & 0xF0) == 0xE0:
        extra, value = 2, lead & 0x0F
    elif (lead & 0xF8) == 0xF0:
        extra, value = 3, lead & 0x07
    elif (lead & 0xFC) == 0xF8:
        extra, value = 4, lead & 0x03
    elif (lead & 0xFE) == 0xFC:
        extra, value = 5, lead & 0x01
    else:
        return lead, i + 1  # error, just use bad character

    end = i + 1 + extra
    if end > len(s):
        # Truncated sequence: treat the lead byte as a bad character
        # instead of indexing past the end of the string.
        return lead, i + 1

    for k in range(i + 1, end):
        value = (value << 6) | (ord(s[k]) & 0x3F)
    return value, end

def lower(x):
    """Return the lower-case form of an ASCII upper-case code point.

    x -- integer code point.

    Code points 65..90 ('A'..'Z') are shifted by 32 to 'a'..'z'; every
    other value is returned unchanged.
    """
    # The upper bound is exclusive, so it must be 91 for 'Z' (90) to count.
    if 64 < x < 91:
        return x + 32
    return x

def split(us, ch=''):
    """Split a sequence of code points into lower-cased UTF-8 words.

    us -- iterable of integer code points.
    ch -- accepted for interface compatibility; not used by the function.

    A code point is a separator when it is below 48 ('0'), between '9'
    and 'A' (58..64), between 'Z' and 'a' (91..96), or between 'z' and
    DEL (123..126).  Everything else -- ASCII digits and letters and every
    code point from 127 upwards -- is a word character.  Word characters
    are passed through lower() and encoded with utf8().

    Returns the list of non-empty words in input order.  A word that runs
    up to the end of the input is included.
    """
    words = []
    current = []  # encoded characters of the word being collected
    for code in us:
        if code < 48 or 57 < code < 65 or 90 < code < 97 or 122 < code < 127:
            # Separator: close the current word, if there is one.
            if current:
                words.append(''.join(current))
                current = []
        else:
            current.append(utf8(lower(code)))
    # Flush the last word; without this an input that does not end in a
    # separator (e.g. a one-word query) would lose its final word.
    if current:
        words.append(''.join(current))
    return words

try:
    find = string.find      # module-level function on Python 1.x / 2.x
except AttributeError:
    find = str.find         # the string-module function is gone on Python 3


class Splitter:
    """Sequence of index words extracted from a UTF-8 encoded string.

    The source is decoded with parse_utf8(), cut into lower-cased words by
    split(), and filtered: words whose encoded length is less than two, and
    words present as keys of the stop-word mapping, are dropped.

    Attributes:
      isrc -- the source string exactly as passed in.
      src  -- list of the retained words, in source order.
    """

    def __init__(self, insrc, stop_word_dic=None):
        """Tokenize insrc.

        insrc         -- source string, expected to be UTF-8 encoded.
        stop_word_dic -- optional mapping whose keys are words to discard;
                         defaults to no stop words.
        """
        # None instead of a shared {} default, so no mutable object is
        # shared between calls.
        if stop_word_dic is None:
            stop_word_dic = {}
        self.isrc = insrc

        # Decode the whole source into a list of code points.
        codes = []
        i = 0
        end = len(insrc)
        while i < end:
            code, i = parse_utf8(insrc, i)
            codes.append(code)

        stop_words = stop_word_dic.keys()
        kept = []
        for word in split(codes, ' '):
            # len() counts encoded characters, so a single ASCII character
            # is dropped while a single multi-byte character is kept.
            if len(word) > 1 and word not in stop_words:
                kept.append(word)
        self.src = kept

    def __getslice__(self, a, b):
        """Return the words in positions a..b-1 (old-style slice hook)."""
        return self.src[a:b]

    def __getitem__(self, a):
        """Return the word at position a (or a sub-list for a slice)."""
        return self.src[a]

    def __len__(self):
        """Return the number of retained words."""
        return len(self.src)

    def indexes(self, a):
        """Return the list of positions at which the word a occurs."""
        hits = []
        for i in range(len(self)):
            if self[i] == a:
                hits.append(i)
        return hits

    def pos(self, a):
        """Return (start, end) offsets in the source for a word.

        a -- the word looked up is self[int(a/2)].
             NOTE(review): the reason the index is halved is not visible
             in this file -- confirm against the caller.

        The offsets are those of the first occurrence of the word in
        self.isrc.  An exact match is tried first.  Because split()
        lower-cases words, a word that had upper-case letters in the
        source cannot match exactly, so an ASCII case-insensitive search
        is then used.  If the word is still not found, start is -1 and
        end is len(word) - 1.
        """
        word = self[int(a/2)]
        start = find(self.isrc, word)
        if start < 0:
            # ASCII-only folding keeps every offset aligned with isrc.
            start = find(self._ascii_lower(self.isrc),
                         self._ascii_lower(word))
        return (start, start + len(word))

    def _ascii_lower(self, text):
        """Return text with 'A'..'Z' lower-cased, all else unchanged."""
        out = []
        for c in text:
            if 'A' <= c <= 'Z':
                c = chr(ord(c) + 32)
            out.append(c)
        return ''.join(out)


------=_NextPart_000_0632_01C02742.16A7E2C0--