[Zope-CVS] CVS: Products/ZCTextIndex - WidCode.py:1.3

Guido van Rossum guido@python.org
Tue, 14 May 2002 11:23:29 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv13284

Modified Files:
	WidCode.py 
Log Message:
There's no point in encoding the number of continuation bytes in the
first byte -- we always find the end of a particular encoded number by
searching for the next byte with the high bit set.  This simplifies
the encoding and gives us more space for small encodings: 128 values
can now be encoded in 1 byte, and 16K in 2 bytes.


=== Products/ZCTextIndex/WidCode.py 1.2 => 1.3 ===
 # desired wid-string in an encoded wid-string.  As in UTF-8, the initial byte
 # of an encoding can't appear in the interior of an encoding, so find() can't
-# be fooled into starting a match "in the middle" of an encoding.
+# be fooled into starting a match "in the middle" of an encoding. Unlike
+# UTF-8, the initial byte does not tell you how many continuation bytes
+# follow; and there's no ASCII superset property.
 
 # Details:
 #
 # + Only the first byte of an encoding has the sign bit set.
 #
-# + The number of bytes in the encoding is encoded in unary at the start of
-#   the first byte (i.e., an encoding with n bytes begins with n 1-bits
-#   followed by a 0 bit).
+# + The first byte has 7 bits of data.
 #
 # + Bytes beyond the first in an encoding have the sign bit clear, followed
 #   by 7 bits of data.
 #
-# + The number of data bits in the first byte of an encoding varies.
+# + The first byte doesn't tell you how many continuation bytes are
+#   following.  You can tell by searching for the next byte with the
+#   high bit set (or the end of the string).
 #
 # The int to be encoded can contain no more than 24 bits.
 # XXX this could certainly be increased
 #
-# If it contains no more than 6 bits, 00abcdef, the encoding is
-#     10abcdef
+# If it contains no more than 7 bits, 0abcdefg, the encoding is
+#     1abcdefg
 #
-# If it contains 7 thru 12 bits,
-#     0000abcd efghijkL
+# If it contains 8 thru 14 bits,
+#     00abcdef ghijkLmn
 # the encoding is
-#     110abcde 0fghijkL
+#     1abcdefg 0hijkLmn
 #
 # Static tables _encoding and _decoding capture all encodes and decodes for
-# 12 or fewer bits.
+# 14 or fewer bits.
 #
-# If it contains 13 thru 18 bits,
-#    000000ab cdefghij kLmnopqr
+# If it contains 15 thru 21 bits,
+#    000abcde fghijkLm nopqrstu
 # the encoding is
-#    1110abcd 0efghijk 0Lmnopqr
+#    1abcdefg 0hijkLmn 0opqrstu
 #
-# If it contains 19 thru 24 bits,
-#    abcdefgh ijkLmnop qrstuvwx
+# If it contains 22 thru 28 bits,
+#    0000abcd efghijkL mnopqrst uvwxyzAB
 # the encoding is
-#    11110abc 0defghij 0kLmnopq 0rstuvwx
+#    1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
 
+assert 0x80**2 == 0x4000
+assert 0x80**4 == 0x10000000
 
 import re
 
@@ -51,18 +55,18 @@
     n = len(wid2enc)
     return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
 
-_encoding = [None] * 0x1000 # Filled later, and converted to a tuple
+_encoding = [None] * 0x4000 # Filled later, and converted to a tuple
 
 def _encode(w):
-    assert 0x1000 <= w < 0x1000000
+    assert 0x4000 <= w < 0x10000000
     b, c = divmod(w, 0x80)
     a, b = divmod(b, 0x80)
     s = chr(b) + chr(c)
-    if a < 0x10:    # no more than 18 data bits
-        return chr(a + 0xE0) + s
+    if a < 0x80:    # no more than 21 data bits
+        return chr(a + 0x80) + s
     a, b = divmod(a, 0x80)
-    assert a < 0x4, (w, a, b, s)  # else more than 24 data bits
-    return (chr(a + 0xF0) + chr(b)) + s
+    assert a < 0x80, (w, a, b, s)  # else more than 28 data bits
+    return (chr(a + 0x80) + chr(b)) + s
 
 _prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
 
@@ -81,22 +85,22 @@
         return 0
     if len(s) == 3:
         a, b, c = map(ord, s)
-        assert a & 0xF0 == 0xE0 and not b & 0x80 and not c & 0x80
-        return ((a & 0xF) << 14) | (b << 7) | c
+        assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
+        return ((a & 0x7F) << 14) | (b << 7) | c
     assert len(s) == 4, `s`
     a, b, c, d = map(ord, s)
-    assert a & 0xF8 == 0xF0 and not b & 0x80 and not c & 0x80 and not d & 0x80
-    return ((a & 0x7) << 21) | (b << 14) | (c << 7) | d
+    assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
+    return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d
 
 def _fill():
     global _encoding
-    for i in range(0x40):
+    for i in range(0x80):
         s = chr(i + 0x80)
         _encoding[i] = s
         _decoding[s] = i
-    for i in range(0x40, 0x1000):
+    for i in range(0x80, 0x4000):
         hi, lo = divmod(i, 0x80)
-        s = chr(hi + 0xC0) + chr(lo)
+        s = chr(hi + 0x80) + chr(lo)
         _encoding[i] = s
         _decoding[s] = i
     _encoding = tuple(_encoding)