gh-121267: Improve performance of tarfile (#121267) (#121269)
https://github.com/python/cpython/commit/2b2d607095335024e5e2bb358e3ef37650536839
commit: 2b2d607095335024e5e2bb358e3ef37650536839
branch: main
author: Johan Förberg <johan@forberg.se>
committer: hauntsaninja <12621235+hauntsaninja@users.noreply.github.com>
date: 2024-10-30T15:08:30-07:00
summary:

gh-121267: Improve performance of tarfile (#121267) (#121269)

Tarfile in the default write mode spends much of its time resolving UIDs
into usernames and GIDs into group names. By caching these mappings, a
significant speedup can be achieved. In my simple benchmark[1], this extra
caching speeds up tarfile by 8x.

[1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2

---------

Co-authored-by: Tian Gao <gaogaotiantian@hotmail.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
Co-authored-by: Shantanu <12621235+hauntsaninja@users.noreply.github.com>

files:
A Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
M Lib/tarfile.py

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 1475b3da2d3293..a0fab46b24e249 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -1760,6 +1760,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
                                 # current position in the archive file
         self.inodes = {}        # dictionary caching the inodes of
                                 # archive members already added
+        self._unames = {}       # Cached mappings of uid -> uname
+        self._gnames = {}       # Cached mappings of gid -> gname

         try:
             if self.mode == "r":
@@ -2138,16 +2140,23 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
         tarinfo.mtime = statres.st_mtime
         tarinfo.type = type
         tarinfo.linkname = linkname
+
+        # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
+        # speed things up, cache the resolved usernames and group names.
         if pwd:
-            try:
-                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
-            except KeyError:
-                pass
+            if tarinfo.uid not in self._unames:
+                try:
+                    self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
+                except KeyError:
+                    self._unames[tarinfo.uid] = ''
+            tarinfo.uname = self._unames[tarinfo.uid]
         if grp:
-            try:
-                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
-            except KeyError:
-                pass
+            if tarinfo.gid not in self._gnames:
+                try:
+                    self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
+                except KeyError:
+                    self._gnames[tarinfo.gid] = ''
+            tarinfo.gname = self._gnames[tarinfo.gid]
         if type in (CHRTYPE, BLKTYPE):
             if hasattr(os, "major") and hasattr(os, "minor"):
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
new file mode 100644
index 00000000000000..9e52405c15a82d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
@@ -0,0 +1,2 @@
+Improve the performance of :mod:`tarfile` when writing files, by caching user names
+and group names.
participants (1)
-
hauntsaninja