[Python-ideas] Optimizing global names via dict references

Sat Dec 19 09:11:31 EST 2015

Dictionary References: An Idea in Three Emails

Act 3: Sample pseudocode

Table of Contents:

    Act 1: Stuff
    https://mail.python.org/pipermail/python-ideas/2015-December/037511.html
    - Definitions
    - Problem statement
    - The idea
    - The benefits
    - The costs
    - What will hold a RefCell?
    - Add `getref` to `dict`
    - Implementation issues
    - Interpreters other than CPython

    Act 2: More stuff
    https://mail.python.org/pipermail/python-ideas/2015-December/037512.html
    - Extension: Scope ancestry
    - Aside: Classes and scopes do not have the same rules
    - Extension: Read/write refs
    - Extension: Read-only refs
    - Option: Alternative RefCell structure
    - Option: Replace the methods when detaching a RefCell
    - Extension: Dynamic scoping
    - Extension: Thread safety

    Act 3: Sample pseudocode
    https://mail.python.org/pipermail/python-ideas/2015-December/037513.html
    - ScopeDict
    - Scope ancestry
    - Alternative RefCell structure
    - Thread safety

== Pseudocode: ScopeDict ==

The CPython dict implementation looks like this (simplified):

    class KVPair:
        __slots__ = {
            'key': object | DUMMY | NULL,
            'value': object | NULL,
        }

    class dict:
        __slots__ = {
            'table': List[KVPair],
            'size': int,
        }

        def lookup(self, key): # returns the entry, or an empty entry
            ...

        def __getitem__(self, key):
            entry = self.lookup(key)
            if entry.key in [NULL, DUMMY]:
                raise KeyError

            return entry.value

        def __setitem__(self, key, value):
            entry = self.lookup(key)

            if entry.key in [NULL, DUMMY]:
                entry.key = key

            entry.value = value

            self.maybe_resize()

        def __delitem__(self, key):
            entry = self.lookup(key)
            if entry.key in [NULL, DUMMY]:
                raise KeyError

            entry.key = DUMMY
            entry.value = NULL

            self.maybe_resize()

        def resize(self):
            old_table = self.table
            self.table = [KVPair() for i in range(self.predict_size())]
            for k, v in old_table:
                self[k] = v

I want to add a `refs` member to hold the RefCells, and a `getref`
function to acquire RefCells.

(I make redundant lookups for demonstration. They are unnecessary in C.)
(In C, self.table.index(entry) is a simple pointer subtraction.)
(In the actual implementation, we could save a direct pointer to the
KVPair's value pointer.)

    class ScopeDict(dict):
        __slots__ = {
            'refs': List[RefCell | NULL],
        }

        def getref(self, key, force_create: bool):
            entry = self.lookup(key)

            if entry.key in [NULL, DUMMY]:
                if force_create:
                    entry.key = key
                    entry.value = NULL
                else:
                    raise KeyError

            # Get the index.
            i = self.table.index(entry)

            cell = self.refs[i] = RefCell()
            cell.indirect = True
            cell.pointer = entry

        def __getitem__(self, key):
            entry = self.lookup(key)
            if entry.key in [NULL, DUMMY] or entry.value is NULL:
                raise KeyError

            return entry.value

        def __delitem__(self, key):
            entry = self.lookup(key)
            if entry.key in [NULL, DUMMY] or entry.value is NULL:
                raise KeyError

            entry.value = NULL

        def resize(self):

            old_table = self.table
            self.table = [KVPair() for i in range(self.predict_size())]
            old_refs = self.refs
            self.refs = [NULL] * len(self.table)

            for ref, (key, value) in zip(old_refs, old_table):
                self[key] = value

                if ref is NULL:
                    continue

                # Update the ref
                entry = self.lookup(key)
                index = self.table.index(entry)
                self.refs[index] = ref
                ref.pointer = entry

        def __del__(self): # with illustrative deletes

            for ref, entry in zip(self.refs, self.table):

                delete entry.key

                if ref is not NULL:
                    ref.pointer = entry.value
                    ref.indirect = False
                    delete ref
                else:
                    delete entry.value

    class RefCell:
        __slots__ = {
            'indirect': bool,
            'pointer': (KVPair | object),
        }

        def get(self):
            if self.indirect:
                value = self.pointer.value
            else:
                value = self.pointer

            if value is NULL:
                raise KeyError

            return value

        def __del__(self):
            if not self.indirect:
                delete self.pointer

== Pseudocode: Scope ancestry ==

Algorithm for ref creation:
(This is complex, because of the decision-making process of whether to
create a RefCell.)

    def ChainScopeDict.getref(self, key,
            force_create: bool, recursive: bool = True):

        # Try the normal getref.
        try:
            ref = super().getref(key, force_create=force_create)
        except KeyError: # We don't have this key.
            if not recursive:
                raise #give up
            else:
                ref = None

        # We now know:
        assert recursive or ref is not None

        if recursive:
            # Try to find a parent with a refcell
            for i, parent in enumerate(self.parents):
                try:
                    parent_ref = parent.getref(key,
force_create=False, recursive=True)
                    good_parent = i
                    break
                except:
                    continue
            else: # No parent has a RefCell for this key
                if ref is None:
                    assert not force_create
                    raise KeyError(key)
                else:
                    parent_ref = None
        else:
            assert ref is not None
            parent_ref = None

        assert parent_ref is not None or ref is not None

        if ref is None:
            assert parent_ref is not None and force_create is False
            ref = super().getref(key, force_create=True)

        # Hack to save on pseudocode:
        ref.__class__ = ChainRefCell

        if parent_ref is None:
            assert force_create or key in self
            # Create no parent refs. It will look up parents later.
            ref.parents = self.parents.copy()
            return ref

        # Create all refs up to the found parent_ref.
        # (Don't create refs for the later parents.)
        ref.parents = ([p.getref(key, force_create=True)
                            for p in self.parents[:good_parent]] +
                       [parent_ref] +
                       self.parents[good_parent + 1:])

        return ref

Algorithm for chain resolution:

    def ChainRefCell.get(self, recursive: bool):

        key = self.key

        try: # Own value.
            return super().get()
        except KeyError:
            if not recursive: # Don't search parents.
                raise
            pass

        # Index of latest parent which is/has a RefCell.
        # We want to create intermediate RefCells for all things in between.
        last_parent_cell = 0

        for i, parent in enumerate(self.parents):

            # We want a parent RefCell, not a parent dict
            if isinstance(parent, ScopeDict):
                try:
                    parent = parent.getref(key, force_create=False)
                except KeyError:
                    continue
                # Don't need the parent dict anymore.
                self.parents[i] = parent

            last_parent_cell = i # This parent is now a refcell.

            # Try to get the parent to resolve.
            try:
                # `recursive=False` for class-like hierarchy;
                # see below
                value = parent.get(recursive=True)
            except KeyError:
                continue
        else:
            # No parent has a value.
            value = NULL

        # Create refs in the parents which come before a refcell.
        # This prevents repeated failed lookups.
        for i, parent in enumerate(self.parents[:last_parent_cell]):
            if isinstance(parent, ScopeDict):
                self.parents[i] = parent.getref(key, force_create=True)

        if value is NULL:
            raise KeyError
        return value

== Pseudocode: Alternative RefCell structure ==

This allows us to have a reference to the key (for KeyError(key))
without taking up an extra bool for checking the dictionary.

    class RefCell:
        __slots__ = {
            '_key': (NULL | object),
            '_value': (KVPair | object),
        }
        """ Two possibilities:
            1. _key == NULL and isinstance(_value, KVPair)
                => the dict is alive.
            2. isinstance(_key, object)  and isinstance(_value, object)
                => this RefCell has been released from its dict.

            (KVPair and NULL are not (py-)objects.)

            So we use _key as a way to tell whether the dict is
            alive, and to tell whether _value points to the
            value or to the table entry.
        """

        def get(self):
            if self._key is NULL: # The pointer is to the KVPair.
                key = self._value.key
                value = self._value.value
            else: # The pointer is to the value.
                key = self._key
                value = self._value

            if value is NULL:
                raise KeyError(key)

            return value

        def __del__(self):
            if self._key is not NULL:
                delete self._key
                delete self._value
            # Otherwise, it doesn't own the references.

        def key(self):
            if self._key is NULL: # Look it up in the dict.
                return self._value.key
            else:
                return self._key

== Pseudocode: Thread safety ==

    for i, (ref, (k, v)) in enumerate(zip(oldrefs, oldtable)):

        # Remove refcells which only have one ref,
        # then remove keys without either value or refcell.
        # No need to lock a refcell with only one ref.
        isdead = self.remove_if_dead(i)

        if isdead:
            continue

        if ref is not NULL:
            lock(ref)

            ref._value = v
            ref._key = k
            unlock(ref)
        else:
            delete k
            delete v