[Zope-dev] ZCatalog : UTF-8 Chinese
Sin Hang Kin
kentsin@poboxes.com
Mon, 25 Sep 2000 22:43:56 +0800
This is a multi-part message in MIME format.
------=_NextPart_000_0632_01C02742.16A7E2C0
Content-Type: text/plain;
charset="Windows-1252"
Content-Transfer-Encoding: 7bit
HI,
I have a project which needs to search Chinese text. I think I can make ZCatalog
search Chinese in UTF-8, so I changed the Voodoo Kludge Splitter.py to convert
the input string to Unicode (assuming it is UTF-8) and wrote my own version of
split (see the attached Splitter.py). I borrowed (stole) the UTF-8 encoding
conversion scheme from Interscript. I separated the Chinese words with spaces by
hand, hoping that ZCatalog would work.
After making these changes, I have a catalog which looks good: I can see from the
vocabulary that the Chinese words are actually there (except for some which have
HTML-encoded characters like < inside the UTF-8).
I generated the search interface and tested it. However, searching for the
index terms returns nothing. I searched for most of the entries found in the
vocabulary, but none of them work; those that do work also return many unwanted
results.
What is causing this failure? What can I do to go further?
Rgs,
Kent Sin
---------------------------------
kentsin.weblogs.com
kentsin.imeme.net
------=_NextPart_000_0632_01C02742.16A7E2C0
Content-Type: application/octet-stream;
name="Splitter.py"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
filename="Splitter.py"
"""Doc String"""
import string
# Borrow from Interscript
#
def utf8(i):
    """Encode the integer code point *i* as a UTF-8 sequence.

    The result is a string built with chr(), one character per encoded
    byte value.  The original 1- to 6-byte UTF-8 layout is used, so values
    up to 0x7FFFFFFF are representable; bits above that are masked off.

    i -- non-negative integer code point.

    Returns the encoded string (1 to 6 characters long).
    """
    if i < 0x80:
        # Plain ASCII is encoded as itself.
        return chr(i)

    # Pick the lead-byte marker, the payload mask for the lead byte and the
    # number of continuation bytes from the magnitude of the code point.
    if i < 0x800:
        marker, mask, tail = 0xC0, 0x1F, 1
    elif i < 0x10000:
        marker, mask, tail = 0xE0, 0x0F, 2
    elif i < 0x200000:
        marker, mask, tail = 0xF0, 0x07, 3
    elif i < 0x4000000:
        marker, mask, tail = 0xF8, 0x03, 4
    else:
        marker, mask, tail = 0xFC, 0x01, 5

    parts = [chr(marker | ((i >> (6 * tail)) & mask))]
    # Continuation bytes carry six payload bits each, most significant first.
    shift = 6 * (tail - 1)
    while shift >= 0:
        parts.append(chr(0x80 | ((i >> shift) & 0x3F)))
        shift = shift - 6
    return ''.join(parts)
def seq_to_utf8(a):
    """Encode a sequence of integer code points as one UTF-8 string.

    a -- iterable of integers, each passed to utf8().

    Returns the concatenation of the individual encodings ('' for an
    empty sequence).
    """
    # Join once instead of growing the string one piece at a time.
    return ''.join([utf8(code) for code in a])
def parse_utf8(s, i):
    """Decode one UTF-8 sequence from *s* starting at index *i*.

    s -- string whose characters each hold one byte value (read via ord()).
    i -- index of the lead byte of the sequence to decode.

    Returns a tuple (code_point, next_index).  The 1- to 6-byte layouts of
    the original UTF-8 scheme are recognised.  A byte that cannot start a
    sequence, or a sequence cut short by the end of *s*, is passed through
    as its own value and consumes exactly one byte.

    Raises IndexError if *i* itself is outside *s*.
    """
    lead = ord(s[i])
    if lead & 0x80 == 0:
        return lead, i + 1  # ASCII

    # The payload mask must shrink as the marker grows; masking every lead
    # byte with 0x1F would leak marker bits into 4-, 5- and 6-byte values.
    if lead & 0xE0 == 0xC0:
        mask, tail = 0x1F, 1
    elif lead & 0xF0 == 0xE0:
        mask, tail = 0x0F, 2
    elif lead & 0xF8 == 0xF0:
        mask, tail = 0x07, 3
    elif lead & 0xFC == 0xF8:
        mask, tail = 0x03, 4
    elif lead & 0xFE == 0xFC:
        mask, tail = 0x01, 5
    else:
        return lead, i + 1  # error, just use bad character

    end = i + 1 + tail
    if end > len(s):
        # Truncated sequence: treat the lead byte as a bad character rather
        # than running off the end of the string.
        return lead, i + 1

    value = lead & mask
    for k in range(i + 1, end):
        value = (value << 6) | (ord(s[k]) & 0x3F)
    return value, end
def lower(x):
    """Return the lowercase form of the integer character code *x*.

    Only the ASCII capitals 'A' (65) through 'Z' (90) are mapped, by adding
    32; every other value is returned unchanged.
    """
    # The upper bound is inclusive so that 'Z' (90) is folded as well.
    if 65 <= x <= 90:
        return x + 32
    return x
def split(us, ch=''):
    """Split a sequence of integer character codes into lowercase words.

    us -- iterable of integer character codes.
    ch -- accepted for interface compatibility; not used by the splitting.

    A code is a separator when it is below '0' (48), between '9' and 'A',
    between 'Z' and 'a', or between 'z' and 127 (exclusive).  Every other
    code, including everything from 127 upwards, is a word character; it is
    passed through lower() and re-encoded with utf8().

    Returns the list of words in order of appearance.  Runs of separators
    never produce empty words.
    """
    words = []
    current = []
    for code in us:
        if code < 48 or 57 < code < 65 or 90 < code < 97 or 122 < code < 127:
            # Separator: close the word in progress, if any.
            if current:
                words.append(''.join(current))
                current = []
        else:
            current.append(utf8(lower(code)))
    # Flush the trailing word; without this, input that does not end in a
    # separator (e.g. a bare search term) would lose its last word.
    if current:
        words.append(''.join(current))
    return words
# Module-level shorthand for string.find.
find =3D string.find
class Splitter:
    """Sequence of lowercase index words extracted from a UTF-8 string.

    The source text is decoded with parse_utf8(), cut into words with
    split(), and filtered: words of length 1 and words that are keys of
    the stop-word dictionary are dropped.  The surviving words are kept in
    self.src; the untouched input is kept in self.isrc.
    """

    def __init__(self, insrc, stop_word_dic=None):
        """Build the word list for *insrc*.

        insrc         -- source string, read as UTF-8 by parse_utf8().
        stop_word_dic -- optional mapping whose keys are words to omit.
        """
        # Avoid a shared mutable default argument.
        if stop_word_dic is None:
            stop_word_dic = {}
        self.isrc = insrc

        # Decode the whole input into a list of integer code points.
        codes = []
        i = 0
        end = len(insrc)
        while i < end:
            code, i = parse_utf8(insrc, i)
            codes.append(code)

        stop_words = stop_word_dic.keys()
        self.src = [word for word in split(codes, ' ')
                    if len(word) > 1 and word not in stop_words]

    def __getslice__(self, a, b):
        """Return the words from index *a* up to (not including) *b*."""
        return self.src[a:b]

    def __getitem__(self, a):
        """Return the word at index *a*."""
        return self.src[a]

    def __len__(self):
        """Return the number of words kept."""
        return len(self.src)

    def indexes(self, a):
        """Return the list of positions at which word *a* occurs."""
        return [k for k, word in enumerate(self.src) if word == a]

    def pos(self, a):
        """Return (start, end) offsets in the source text for a word.

        The word looked up is self[int(a / 2)].  The offsets refer to the
        first place its text occurs in the source; when it does not occur,
        start is -1 (and end is -1 plus the word length).
        """
        # NOTE(review): the index is halved before the lookup; the reason
        # is not visible in this file -- confirm against the caller.
        word = self[int(a / 2)]
        # Words are stored lowercased while the source keeps its original
        # case, so compare case-folded copies; folding only ASCII letters
        # keeps every offset unchanged.
        start = self._fold_ascii(self.isrc).find(self._fold_ascii(word))
        return (start, start + len(word))

    def _fold_ascii(self, text):
        """Return *text* with only ASCII 'A'-'Z' lowercased (same length)."""
        folded = []
        for c in text:
            if 'A' <= c <= 'Z':
                c = chr(ord(c) + 32)
            folded.append(c)
        return ''.join(folded)
------=_NextPart_000_0632_01C02742.16A7E2C0--