[Python-ideas] Optimizing global names via dict references
Franklin? Lee
leewangzhong+python at gmail.com
Sat Dec 19 09:11:31 EST 2015
Dictionary References: An Idea in Three Emails
Act 3: Sample pseudocode
Table of Contents:
Act 1: Stuff
https://mail.python.org/pipermail/python-ideas/2015-December/037511.html
- Definitions
- Problem statement
- The idea
- The benefits
- The costs
- What will hold a RefCell?
- Add `getref` to `dict`
- Implementation issues
- Interpreters other than CPython
Act 2: More stuff
https://mail.python.org/pipermail/python-ideas/2015-December/037512.html
- Extension: Scope ancestry
- Aside: Classes and scopes do not have the same rules
- Extension: Read/write refs
- Extension: Read-only refs
- Option: Alternative RefCell structure
- Option: Replace the methods when detaching a RefCell
- Extension: Dynamic scoping
- Extension: Thread safety
Act 3: Sample pseudocode
https://mail.python.org/pipermail/python-ideas/2015-December/037513.html
- ScopeDict
- Scope ancestry
- Alternative RefCell structure
- Thread safety
== Pseudocode: ScopeDict ==
The CPython dict implementation looks like this (simplified):
class KVPair:
__slots__ = {
'key': object | DUMMY | NULL,
'value': object | NULL,
}
class dict:
__slots__ = {
'table': List[KVPair],
'size': int,
}
def lookup(self, key): # returns the entry, or an empty entry
...
def __getitem__(self, key):
entry = self.lookup(key)
if entry.key in [NULL, DUMMY]:
raise KeyError
return entry.value
def __setitem__(self, key, value):
entry = self.lookup(key)
if entry.key in [NULL, DUMMY]:
entry.key = key
entry.value = value
self.maybe_resize()
def __delitem__(self, key):
entry = self.lookup(key)
if entry.key in [NULL, DUMMY]:
raise KeyError
entry.key = DUMMY
entry.value = NULL
self.maybe_resize()
def resize(self):
old_table = self.table
self.table = [KVPair() for i in range(self.predict_size())]
for k, v in old_table:
self[k] = v
I want to add a `refs` member to hold the RefCells, and a `getref`
function to acquire RefCells.
(I make redundant lookups for demonstration. They are unnecessary in C.)
(In C, self.table.index(entry) is a simple pointer subtraction.)
(In the actual implementation, we could save a direct pointer to the
KVPair's value pointer.)
class ScopeDict(dict):
__slots__ = {
'refs': List[RefCell | NULL],
}
def getref(self, key, force_create: bool):
entry = self.lookup(key)
if entry.key in [NULL, DUMMY]:
if force_create:
entry.key = key
entry.value = NULL
else:
raise KeyError
# Get the index.
i = self.table.index(entry)
cell = self.refs[i] = RefCell()
cell.indirect = True
cell.pointer = entry
def __getitem__(self, key):
entry = self.lookup(key)
if entry.key in [NULL, DUMMY] or entry.value is NULL:
raise KeyError
return entry.value
def __delitem__(self, key):
entry = self.lookup(key)
if entry.key in [NULL, DUMMY] or entry.value is NULL:
raise KeyError
entry.value = NULL
def resize(self):
old_table = self.table
self.table = [KVPair() for i in range(self.predict_size())]
old_refs = self.refs
self.refs = [NULL] * len(self.table)
for ref, (key, value) in zip(old_refs, old_table):
self[key] = value
if ref is NULL:
continue
# Update the ref
entry = self.lookup(key)
index = self.table.index(entry)
self.refs[index] = ref
ref.pointer = entry
def __del__(self): # with illustrative deletes
for ref, entry in zip(self.refs, self.table):
delete entry.key
if ref is not NULL:
ref.pointer = entry.value
ref.indirect = False
delete ref
else:
delete entry.value
class RefCell:
__slots__ = {
'indirect': bool,
'pointer': (KVPair | object),
}
def get(self):
if self.indirect:
value = self.pointer.value
else:
value = self.pointer
if value is NULL:
raise KeyError
return value
def __del__(self):
if not self.indirect:
delete self.pointer
== Pseudocode: Scope ancestry ==
Algorithm for ref creation:
(This is complex, because of the decision-making process of whether to
create a RefCell.)
def ChainScopeDict.getref(self, key,
force_create: bool, recursive: bool = True):
# Try the normal getref.
try:
ref = super().getref(key, force_create=force_create)
except KeyError: # We don't have this key.
if not recursive:
raise #give up
else:
ref = None
# We now know:
assert recursive or ref is not None
if recursive:
# Try to find a parent with a refcell
for i, parent in enumerate(self.parents):
try:
parent_ref = parent.getref(key,
force_create=False, recursive=True)
good_parent = i
break
except:
continue
else: # No parent has a RefCell for this key
if ref is None:
assert not force_create
raise KeyError(key)
else:
parent_ref = None
else:
assert ref is not None
parent_ref = None
assert parent_ref is not None or ref is not None
if ref is None:
assert parent_ref is not None and force_create is False
ref = super().getref(key, force_create=True)
# Hack to save on pseudocode:
ref.__class__ = ChainRefCell
if parent_ref is None:
assert force_create or key in self
# Create no parent refs. It will look up parents later.
ref.parents = self.parents.copy()
return ref
# Create all refs up to the found parent_ref.
# (Don't create refs for the later parents.)
ref.parents = ([p.getref(key, force_create=True)
for p in self.parents[:good_parent]] +
[parent_ref] +
self.parents[good_parent + 1:])
return ref
Algorithm for chain resolution:
def ChainRefCell.get(self, recursive: bool):
key = self.key
try: # Own value.
return super().get()
except KeyError:
if not recursive: # Don't search parents.
raise
pass
# Index of latest parent which is/has a RefCell.
# We want to create intermediate RefCells for all things in between.
last_parent_cell = 0
for i, parent in enumerate(self.parents):
# We want a parent RefCell, not a parent dict
if isinstance(parent, ScopeDict):
try:
parent = parent.getref(key, force_create=False)
except KeyError:
continue
# Don't need the parent dict anymore.
self.parents[i] = parent
last_parent_cell = i # This parent is now a refcell.
# Try to get the parent to resolve.
try:
# `recursive=False` for class-like hierarchy;
# see below
value = parent.get(recursive=True)
except KeyError:
continue
else:
# No parent has a value.
value = NULL
# Create refs in the parents which come before a refcell.
# This prevents repeated failed lookups.
for i, parent in enumerate(self.parents[:last_parent_cell]):
if isinstance(parent, ScopeDict):
self.parents[i] = parent.getref(key, force_create=True)
if value is NULL:
raise KeyError
return value
== Pseudocode: Alternative RefCell structure ==
This allows us to have a reference to the key (for KeyError(key))
without taking up an extra bool for checking the dictionary.
class RefCell:
__slots__ = {
'_key': (NULL | object),
'_value': (KVPair | object),
}
""" Two possibilities:
1. _key == NULL and isinstance(_value, KVPair)
=> the dict is alive.
2. isinstance(_key, object) and isinstance(_value, object)
=> this RefCell has been released from its dict.
(KVPair and NULL are not (py-)objects.)
So we use _key as a way to tell whether the dict is
alive, and to tell whether _value points to the
value or to the table entry.
"""
def get(self):
if self._key is NULL: # The pointer is to the KVPair.
key = self._value.key
value = self._value.value
else: # The pointer is to the value.
key = self._key
value = self._value
if value is NULL:
raise KeyError(key)
return value
def __del__(self):
if self._key is not NULL:
delete self._key
delete self._value
# Otherwise, it doesn't own the references.
def key(self):
if self._key is NULL: # Look it up in the dict.
return self._value.key
else:
return self._key
== Pseudocode: Thread safety ==
for i, (ref, (k, v)) in enumerate(zip(oldrefs, oldtable)):
# Remove refcells which only have one ref,
# then remove keys without either value or refcell.
# No need to lock a refcell with only one ref.
isdead = self.remove_if_dead(i)
if isdead:
continue
if ref is not NULL:
lock(ref)
ref._value = v
ref._key = k
unlock(ref)
else:
delete k
delete v
More information about the Python-ideas
mailing list