[Zodb-checkins] CVS: ZODB3/ZEO - simul.py:1.12.8.2.18.8
Jeremy Hylton
cvs-admin at zope.org
Thu Dec 4 01:29:45 EST 2003
Update of /cvs-repository/ZODB3/ZEO
In directory cvs.zope.org:/tmp/cvs-serv1592
Modified Files:
Tag: Zope-2_6-branch
simul.py
Log Message:
Commit buggy implementation of an ARC-style cache.
It was difficult to realize the ARC approach for a cache with
variable-sized objects. There are still a bunch of boundary conditions that
seem wrong. And the list-isn't-empty assertions fail sometimes.
Add apparently useless option to CircularCache to skip things near the
front of the LRU list. The approach is lousy and slow and probably
can't be improved, because it's expensive to keep track of the
contents of the front of the LRU list.
=== ZODB3/ZEO/simul.py 1.12.8.2.18.7 => 1.12.8.2.18.8 ===
--- ZODB3/ZEO/simul.py:1.12.8.2.18.7 Wed Dec 3 01:02:54 2003
+++ ZODB3/ZEO/simul.py Thu Dec 4 01:29:43 2003
@@ -43,9 +43,9 @@
MB = 1000*1000
cachelimit = 20*MB
simclass = ZEOCacheSimulation
- omicron = None
+ theta = omicron = None
try:
- opts, args = getopt.getopt(sys.argv[1:], "bflyz2cOs:o:")
+ opts, args = getopt.getopt(sys.argv[1:], "bflyz2cOas:o:t:")
except getopt.error, msg:
usage(msg)
return 2
@@ -68,8 +68,12 @@
simclass = CircularCacheSimulation
if o == '-o':
omicron = float(a)
+ if o == '-t':
+ theta = float(a)
if o == '-O':
simclass = OracleSimulation
+ if o == '-a':
+ simclass = ARCCacheSimulation
if len(args) != 1:
usage("exactly one file argument required")
return 2
@@ -104,8 +108,8 @@
return 1
# Create simulation object
- if omicron is not None:
- sim = simclass(cachelimit, omicron)
+ if omicron is not None or theta is not None:
+ sim = simclass(cachelimit, omicron, theta)
elif simclass is OracleSimulation:
sim = simclass(cachelimit, filename)
else:
@@ -172,7 +176,7 @@
self.total_writes = 0
if not hasattr(self, "extras"):
self.extras = (self.extraname,)
- self.format = self.format + " %6s" * len(self.extras)
+ self.format = self.format + " %7s" * len(self.extras)
# Reset per-run statistics and set up simulation data
self.restart()
@@ -468,7 +472,7 @@
# object based, so, for example, it's hard to compute the number
# of oids to store in a1out based on the size of a1in.
- extras = "evicts", "hothit"
+ extras = "evicts", "hothit", "am_add"
def __init__(self, cachelimit, outlen=10000):
Simulation.__init__(self, cachelimit)
@@ -483,6 +487,7 @@
self.total_evicts = 0
self.total_hothit = 0
+ self.total_am_add = 0
self.a1out_limit = outlen
# An LRU queue of hot objects
@@ -578,6 +583,8 @@
self.am_size += node.size
node.linkbefore(self.am)
self.cache[oid] = node
+ self.am_add += 1
+ self.total_am_add += 1
else:
# New objects enter the cache via a1in. If they
# are frequently used over a long enough time, they
@@ -611,12 +618,13 @@
self.evicts = 0
self.hothit = 0
+ self.am_add = 0
class Node2Q(Node):
__slots__ = ["kind"]
- def __init__(self, oid, size, kind):
+ def __init__(self, oid, size, kind=None):
Node.__init__(self, oid, size)
self.kind = kind
@@ -625,14 +633,256 @@
self.kind = next.kind
Node.linkbefore(self, next)
+lruT = object()
+lruB = object()
+fifoT = object()
+fifoB = object()
+
+class ARCCacheSimulation(Simulation):
+
+ # Based on the paper ARC: A Self-Tuning, Low Overhead Replacement
+ # Cache by Nimrod Megiddo and Dharmendra S. Modha, USENIX FAST
+ # 2003. The paper describes a block-based cache. A lot of the
+ # details need to be fiddled to work with an object-based cache.
+ # For size issues, the key insight ended up being conditions
+ # A.1-A.5 rather than the details of the algorithm in Fig. 4.
+
+ extras = "lruThits", "evicts", "p"
+
+ def __init__(self, cachelimit):
+ Simulation.__init__(self, cachelimit)
+ # There are two pairs of linked lists. Each pair has a top and
+ # bottom half. The bottom half contains metadata, but not actual
+ # objects.
+
+ # LRU list of frequently used objects
+ self.lruT = Node2Q(None, None, lruT)
+ self.lruT.linkbefore(self.lruT)
+ self.lruT_len = 0
+ self.lruT_size = 0
+
+ self.lruB = Node2Q(None, None, lruB)
+ self.lruB.linkbefore(self.lruB)
+ self.lruB_len = 0
+ self.lruB_size = 0
+
+ # FIFO list of objects seen once
+ self.fifoT = Node2Q(None, None, fifoT)
+ self.fifoT.linkbefore(self.fifoT)
+ self.fifoT_len = 0
+ self.fifoT_size = 0
+
+ self.fifoB = Node2Q(None, None, fifoB)
+ self.fifoB.linkbefore(self.fifoB)
+ self.fifoB_len = 0
+ self.fifoB_size = 0
+
+ # maps oid to node
+ self.cache = {}
+
+ # The paper says that p should be adjusted by 1 as the minimum:
+ # "The compound effect of such small increments and decrements
+ # to p is quite profound as we will demonstrate in the next
+ # section." Not really, as far as I can tell. In my traces
+ # with a very small cache, it was taking far too long to adjust
+ # towards favoring some FIFO component. I would guess that the
+ # chief difference is that our caches are much bigger than the
+ # ones they experimented with. Their biggest cache had 512K
+ # entries, while our smallest cache will have 40 times that many
+ # entries.
+
+ self.p = 0
+ # XXX multiply computed adjustments to p by walk_factor
+ self.walk_factor = 500
+
+ # statistics
+ self.total_hits = 0
+ self.total_lruThits = 0
+ self.total_fifoThits = 0
+ self.total_evicts = 0
+
+ def restart(self):
+ Simulation.restart(self)
+ self.hits = 0
+ self.lruThits = 0
+ self.fifoThits = 0
+ self.evicts = 0
+
+ def write(self, oid, size):
+ pass
+
+ def replace(self, lruB=False):
+ self.evicts += 1
+ self.total_evicts += 1
+ if self.fifoT_size > self.p or (lruB and self.fifoT_size == self.p):
+ node = self.fifoT.next
+ assert node is not self.fifoT, self.stats()
+ node.linkbefore(self.fifoB)
+ self.fifoT_len -= 1
+ self.fifoT_size -= node.size
+ self.fifoB_len += 1
+ self.fifoB_size += node.size
+ else:
+ node = self.lruT.next
+ assert node is not self.lruT, self.stats()
+ node.linkbefore(self.lruB)
+ self.lruT_len -= 1
+ self.lruT_size -= node.size
+ self.lruB_len += 1
+ self.lruB_size += node.size
+ return node.size
+
+ def stats(self):
+ self.totalsize = self.lruT_size + self.fifoT_size
+ self.allsize = self.totalsize + self.lruB_size + self.fifoB_size
+ print "cachelimit = %s totalsize=%s allsize=%s" % (
+ addcommas(self.cachelimit),
+ addcommas(self.totalsize),
+ addcommas(self.allsize))
+
+ fmt = (
+ "p=%(p)d\n"
+ "lruT = %(lruT_len)5d / %(lruT_size)8d / %(lruThits)d\n"
+ "fifoT = %(fifoT_len)5d / %(fifoT_size)8d / %(fifoThits)d\n"
+ "lruB = %(lruB_len)5d / %(lruB_size)8d\n"
+ "fifoB = %(fifoB_len)5d / %(fifoB_size)8d\n"
+ "loads=%(loads)d hits=%(hits)d evicts=%(evicts)d\n"
+ )
+ print fmt % self.__dict__
+
+ def report(self):
+ self.total_p = self.p
+ Simulation.report(self)
+## self.stats()
+
+ def load(self, oid, size):
+## maybe(self.stats, p=0.002)
+ node = self.cache.get(oid)
+ if node is None:
+ # cache miss: We're going to insert a new object in fifoT.
+ # If fifo is full, we'll need to evict something to make
+ # room for it.
+
+ need = size
+ while need > 0:
+ if size + self.fifoT_size + self.fifoB_size >= self.cachelimit:
+ if need + self.fifoT_size >= self.cachelimit:
+ node = self.fifoB.next
+ assert node is not self.fifoB, self.stats()
+ node.unlink()
+ del self.cache[node.oid]
+ self.fifoB_size -= node.size
+ self.fifoB_len -= 1
+ self.evicts += 1
+ self.total_evicts += 1
+ else:
+ node = self.fifoB.next
+ assert node is not self.fifoB, self.stats()
+ node.unlink()
+ del self.cache[node.oid]
+ self.fifoB_size -= node.size
+ self.fifoB_len -= 1
+ if self.fifoT_size + self.lruT_size > self.cachelimit:
+ need -= self.replace()
+ else:
+ incache_size = self.fifoT_size + self.lruT_size + need
+ total_size = (incache_size + self.fifoB_size
+ + self.lruB_size)
+ if total_size >= self.cachelimit * 2:
+ node = self.lruB.next
+ assert node is not self.lruB
+ node.unlink()
+ del self.cache[node.oid]
+ self.lruB_size -= node.size
+ self.lruB_len -= 1
+ elif incache_size > self.cachelimit:
+ need -= self.replace()
+ else:
+ break
+
+ node = Node2Q(oid, size)
+ node.linkbefore(self.fifoT)
+ self.fifoT_len += 1
+ self.fifoT_size += size
+ self.cache[oid] = node
+ else:
+ # a cache hit, but possibly in a bottom list that doesn't
+ # actually hold the object
+ if node.kind is lruT:
+ node.linkbefore(self.lruT)
+
+ self.hits += 1
+ self.total_hits += 1
+ self.lruThits += 1
+ self.total_lruThits += 1
+
+ elif node.kind is fifoT:
+ node.linkbefore(self.lruT)
+ self.fifoT_len -= 1
+ self.lruT_len += 1
+ self.fifoT_size -= node.size
+ self.lruT_size += node.size
+
+ self.hits += 1
+ self.total_hits += 1
+ self.fifoThits += 1
+ self.total_fifoThits += 1
+
+ elif node.kind is fifoB:
+ node.linkbefore(self.lruT)
+ self.fifoB_len -= 1
+ self.lruT_len += 1
+ self.fifoB_size -= node.size
+ self.lruT_size += node.size
+
+ # XXX need a better min than 1?
+## print "adapt+", max(1, self.lruB_size // self.fifoB_size)
+ delta = max(1, self.lruB_size // max(1, self.fifoB_size))
+ self.p += delta * self.walk_factor
+ if self.p > self.cachelimit:
+ self.p = self.cachelimit
+
+ need = node.size
+ if self.lruT_size + self.fifoT_size + need > self.cachelimit:
+ while need > 0:
+ need -= self.replace()
+
+ elif node.kind is lruB:
+ node.linkbefore(self.lruT)
+ self.lruB_len -= 1
+ self.lruT_len += 1
+ self.lruB_size -= node.size
+ self.lruT_size += node.size
+
+ # XXX need a better min than 1?
+## print "adapt-", max(1, self.fifoB_size // self.lruB_size)
+ delta = max(1, self.fifoB_size // max(1, self.lruB_size))
+ self.p -= delta * self.walk_factor
+ if self.p < 0:
+ self.p = 0
+
+ need = node.size
+ if self.lruT_size + self.fifoT_size + need > self.cachelimit:
+ while need > 0:
+ need -= self.replace(lruB=True)
+
+ def inval(self, oid):
+ pass
+
+ def extraheader(self):
+ pass
+
class OracleSimulation(LRUCacheSimulation):
# Not sure how to implement this yet. This is a cache where I
# cheat to see how good we could actually do. The cache
# replacement problem for multi-size caches is NP-hard, so we're
- # not going to have an optimal solution. Somehow, prime the
- # cache with a list of objects that will be accessed more than
- # once and only cache those objects.
+ # not going to have an optimal solution.
+
+ # At the moment, the oracle is mostly blind. It knows which
+ # objects will be referenced more than once, so that it can
+ # ignore objects referenced only once. In most traces, these
+ # objects account for about 20% of references.
def __init__(self, cachelimit, filename):
LRUCacheSimulation.__init__(self, cachelimit)
@@ -700,35 +950,43 @@
# depending on how far away they are now. The omicron parameter
# specifies a percentage
- extras = "evicts", "copies"
+ extras = "evicts", "copies", "inuse", "skips"
- def __init__(self, cachelimit, omicron=0):
+ def __init__(self, cachelimit, omicron=None, skip=None):
Simulation.__init__(self, cachelimit)
- self.omicron = omicron
+ self.omicron = omicron or 0
+ self.skip = skip or 0
self.total_evicts = 0
self.total_copies = 0
+ self.total_skips = 0
# Current offset in file
self.offset = 0
# Map offset in file to tuple of size, oid
self.filemap = {0: (self.cachelimit, None)}
- # Map oid to offset
+ # Map oid to offset, node
self.cache = {}
+ # LRU list of oids
+ self.head = Node(None, None)
+ self.head.linkbefore(self.head)
def extraheader(self):
- print "omicron = %s" % self.omicron
+ print "omicron = %s, theta = %s" % (self.omicron, self.skip)
def restart(self):
Simulation.restart(self)
self.evicts = 0
self.copies = 0
+ self.skips = 0
def load(self, oid, size):
- pos = self.cache.get(oid)
- if pos is None:
+ p = self.cache.get(oid)
+ if p is None:
self.add(oid, size)
else:
+ pos, node = p
self.hits += 1
self.total_hits += 1
+ node.linkbefore(self.head)
self.copy(oid, size, pos)
def check(self):
@@ -737,31 +995,27 @@
while d:
pos, (size, oid) = d.popitem()
next = pos + size
- assert next in d or next in done or next == self.cachelimit
+ if not (next in d or next in done or next == self.cachelimit):
+ print "check", next, pos, size, repr(oid)
+ self.dump()
+ raise RuntimeError
done[pos] = pos
- print "checked", self.offset
def dump(self):
+ print len(self.filemap)
L = list(self.filemap)
L.sort()
- print "cache size", len(L)
- next = None
- for pos in L:
- size, oid = self.filemap[pos]
- if oid:
- print pos, size, repr(oid), self.cache[oid]
- else:
- print pos, size, None
- if next is not None:
- assert next == pos
- next = pos + size
- print
+ for k in L:
+ v = self.filemap[k]
+ print k, v[0], repr(v[1])
def add(self, oid, size):
avail = self.makeroom(size)
assert oid not in self.cache
self.filemap[self.offset] = size, oid
- self.cache[oid] = self.offset
+ node = Node(oid, size)
+ node.linkbefore(self.head)
+ self.cache[oid] = self.offset, node
self.offset += size
# All the space made available must be accounted for in filemap.
excess = avail - size
@@ -773,20 +1027,52 @@
self.offset = 0
pos = self.offset
# Evict enough objects to make the necessary space available.
+ self.compute_closeenough()
+ evicted = False
while need > 0:
+ if pos == self.cachelimit:
+ print "wrap makeroom", need
+ pos = 0
try:
- size, oid = self.filemap.pop(pos)
+ size, oid = self.filemap[pos]
except:
+ self.dump()
raise
-
+
+ if not evicted and self.skip and oid and self.closeenough(oid):
+ self.skips += 1
+ self.total_skips += 1
+ self.offset += size
+ pos += size
+ continue
+
+ evicted = True
+ del self.filemap[pos]
+
if oid is not None:
self.evicts += 1
self.total_evicts += 1
- del self.cache[oid]
- pos += size
+ pos, node = self.cache.pop(oid)
+ node.unlink()
need -= size
+ pos += size
+
return pos - self.offset
+ def compute_closeenough(self):
+ self.lru = {}
+ n = int(len(self.cache) * self.skip)
+ node = self.head.prev
+ while n > 0:
+ self.lru[node.oid] = True
+ node = node.prev
+ n -= 1
+
+ def closeenough(self, oid):
+ # If oid is in the top portion of the most recently used
+ # elements, skip it.
+ return oid in self.lru
+
def copy(self, oid, size, pos):
# Copy only if the distance is greater than omicron.
dist = self.offset - pos
@@ -796,24 +1082,28 @@
self.copies += 1
self.total_copies += 1
self.filemap[pos] = size, None
- del self.cache[oid]
+ pos, node = self.cache.pop(oid)
+ node.unlink()
self.add(oid, size)
def inval(self, oid):
- pos = self.cache.get(oid)
- if pos is None:
+ p = self.cache.get(oid)
+ if p is None:
return
+ pos, node = p
self.invals += 1
self.total_invals += 1
size, _oid = self.filemap[pos]
assert oid == _oid
self.filemap[pos] = size, None
- del self.cache[oid]
+ pos, node = self.cache.pop(oid)
+ node.unlink()
def write(self, oid, size):
- pos = self.cache.get(oid)
- if pos is None:
+ p = self.cache.get(oid)
+ if p is None:
return
+ pos, node = p
oldsize, _oid = self.filemap[pos]
assert oid == _oid
if size == oldsize:
@@ -824,9 +1114,23 @@
self.filemap[pos + size] = excess, None
else:
self.filemap[pos] = oldsize, None
- del self.cache[oid]
+ pos, node = self.cache.pop(oid)
+ node.unlink()
self.add(oid, size)
+ def report(self):
+ free = used = total = 0
+ for size, oid in self.filemap.itervalues():
+ total += size
+ if oid:
+ used += size
+ else:
+ free += size
+
+ self.inuse = round(100.0 * used / total, 1)
+ self.total_inuse = self.inuse
+ Simulation.report(self)
+
class BuddyCacheSimulation(LRUCacheSimulation):
def __init__(self, cachelimit):
@@ -1146,6 +1450,12 @@
s = s[:i] + ',' + s[i:]
i -= 3
return sign + s
+
+import random
+
+def maybe(f, p=0.5):
+ if random.random() < p:
+ f()
if __name__ == "__main__":
sys.exit(main())
More information about the Zodb-checkins
mailing list