r46505 - python/trunk/Tools/pybench/systimes.py
Author: marc-andre.lemburg Date: Sun May 28 19:46:58 2006 New Revision: 46505 Added: python/trunk/Tools/pybench/systimes.py (contents, props changed) Log: Initial version of systimes - a module to provide platform dependent performance measurements. The module is currently just a proof-of-concept implementation, but will integrated into pybench once it is stable enough. License: pybench license. Author: Marc-Andre Lemburg. Added: python/trunk/Tools/pybench/systimes.py ============================================================================== --- (empty file) +++ python/trunk/Tools/pybench/systimes.py Sun May 28 19:46:58 2006 @@ -0,0 +1,197 @@ +#!/usr/bin/env python + +""" systimes() user and system timer implementations for use by + pybench. + + This module implements various different strategies for measuring + performance timings. It tries to choose the best available method + based on the platforma and available tools. + + On Windows, it is recommended to have the Mark Hammond win32 + package installed. Alternatively, the Thomas Heller ctypes + packages can also be used. + + On Unix systems, the standard resource module provides the highest + resolution timings. Unfortunately, it is not available on all Unix + platforms. + + If no supported timing methods based on process time can be found, + the module reverts to the highest resolution wall-time timer + instead. The system time part will then always be 0.0. + + The module exports one public API: + + def systimes(): + + Return the current timer values for measuring user and system + time as tuple of seconds (user_time, system_time). + + Copyright (c) 2006, Marc-Andre Lemburg (mal@egenix.com). See the + documentation for further information on copyrights, or contact + the author. All Rights Reserved. + +""" +import time, sys, struct + +# +# Note: Please keep this module compatible to Python 1.5.2. +# +# TODOs: +# +# * Add ctypes wrapper for new clock_gettime() real-time POSIX APIs; +# these will then provide nano-second resolution where available. +# +# * Add a function that returns the resolution of systimes() +# values, ie. systimesres(). +# + +### Choose an implementation + +SYSTIMES_IMPLEMENTATION = None +USE_CTYPES_GETPROCESSTIMES = 'cytpes GetProcessTimes() wrapper' +USE_WIN32PROCESS_GETPROCESSTIMES = 'win32process.GetProcessTimes()' +USE_RESOURCE_GETRUSAGE = 'resource.getrusage()' +USE_PROCESS_TIME_CLOCK = 'time.clock() (process time)' +USE_WALL_TIME_CLOCK = 'time.clock() (wall-time)' +USE_WALL_TIME_TIME = 'time.time() (wall-time)' + +if sys.platform[:3] == 'win': + # Windows platform + try: + import win32process + except ImportError: + try: + import ctypes + except ImportError: + # Use the wall-time implementation time.clock(), since this + # is the highest resolution clock available on Windows + SYSTIMES_IMPLEMENTATION = USE_WALL_TIME_CLOCK + else: + SYSTIMES_IMPLEMENTATION = USE_CTYPES_GETPROCESSTIMES + else: + SYSTIMES_IMPLEMENTATION = USE_WIN32PROCESS_GETPROCESSTIMES +else: + # All other platforms + try: + import resource + except ImportError: + pass + else: + SYSTIMES_IMPLEMENTATION = USE_RESOURCE_GETRUSAGE + +# Fall-back solution +if SYSTIMES_IMPLEMENTATION is None: + # Check whether we can use time.clock() as approximation + # for systimes() + start = time.clock() + time.sleep(0.1) + stop = time.clock() + if stop - start < 0.001: + # Looks like time.clock() is usable (and measures process + # time) + SYSTIMES_IMPLEMENTATION = USE_PROCESS_TIME_CLOCK + else: + # Use wall-time implementation time.time() since this provides + # the highest resolution clock on most systems + SYSTIMES_IMPLEMENTATION = USE_WALL_TIME_TIME + +### Implementations + +def getrusage_systimes(): + return resource.getrusage(resource.RUSAGE_SELF)[:2] + +def process_time_clock_systimes(): + return (time.clock(), 0.0) + +def wall_time_clock_systimes(): + return (time.clock(), 0.0) + +def wall_time_time_systimes(): + return (time.time(), 0.0) + +# Number of clock ticks per second for the values returned +# by GetProcessTimes() on Windows. +# +# Note: Ticks returned by GetProcessTimes() are micro-seconds on +# Windows XP (though the docs say 100ns intervals) +WIN32_PROCESS_TIMES_TICKS_PER_SECOND = 10e6 + +def win32process_getprocesstimes_systimes(): + d = win32process.GetProcessTimes(win32process.GetCurrentProcess()) + # Note: I'm not sure whether KernelTime on Windows is the same as + # system time on Unix - I've yet to see a non-zero value for + # KernelTime on Windows. + return (d['UserTime'] / WIN32_PROCESS_TIMES_TICKS_PER_SECOND, + d['KernelTime'] / WIN32_PROCESS_TIMES_TICKS_PER_SECOND) + +def ctypes_getprocesstimes_systimes(): + creationtime = ctypes.c_ulonglong() + exittime = ctypes.c_ulonglong() + kerneltime = ctypes.c_ulonglong() + usertime = ctypes.c_ulonglong() + rc = ctypes.windll.kernel32.GetProcessTimes( + ctypes.windll.kernel32.GetCurrentProcess(), + ctypes.byref(creationtime), + ctypes.byref(exittime), + ctypes.byref(kerneltime), + ctypes.byref(usertime)) + if not rc: + raise TypeError('GetProcessTimes() returned an error') + return (usertime.value / WIN32_PROCESS_TIMES_TICKS_PER_SECOND, + kerneltime.value / WIN32_PROCESS_TIMES_TICKS_PER_SECOND) + +# Select the default for the systimes() function + +if SYSTIMES_IMPLEMENTATION is USE_RESOURCE_GETRUSAGE: + systimes = getrusage_systimes + +elif SYSTIMES_IMPLEMENTATION is USE_PROCESS_TIME_CLOCK: + systimes = process_time_clock_systimes + +elif SYSTIMES_IMPLEMENTATION is USE_WALL_TIME_CLOCK: + systimes = wall_time_clock_systimes + +elif SYSTIMES_IMPLEMENTATION is USE_WALL_TIME_TIME: + systimes = wall_time_time_systimes + +elif SYSTIMES_IMPLEMENTATION is USE_WIN32PROCESS_GETPROCESSTIMES: + systimes = win32process_getprocesstimes_systimes + +elif SYSTIMES_IMPLEMENTATION is USE_CTYPES_GETPROCESSTIMES: + systimes = ctypes_getprocesstimes_systimes + +else: + raise TypeError('no suitable systimes() implementation found') + +### Testing + +def some_workload(): + x = 0L + for i in xrange(10000000L): + x = x + 1L + +def test_workload(): + print 'Testing systimes() under load conditions' + t0 = systimes() + some_workload() + t1 = systimes() + print 'before:', t0 + print 'after:', t1 + print 'differences:', (t1[0] - t0[0], t1[1] - t0[1]) + print + +def test_idle(): + print 'Testing systimes() under idle conditions' + t0 = systimes() + time.sleep(1) + t1 = systimes() + print 'before:', t0 + print 'after:', t1 + print 'differences:', (t1[0] - t0[0], t1[1] - t0[1]) + print + +if __name__ == '__main__': + print 'Using %s as timer' % SYSTIMES_IMPLEMENTATION + print + test_workload() + test_idle()
Author: marc-andre.lemburg Date: Sun May 28 19:46:58 2006 New Revision: 46505
Added: python/trunk/Tools/pybench/systimes.py (contents, props changed)
...
+def win32process_getprocesstimes_systimes(): + d = win32process.GetProcessTimes(win32process.GetCurrentProcess()) + # Note: I'm not sure whether KernelTime on Windows is the same as + # system time on Unix - I've yet to see a non-zero value for + # KernelTime on Windows. + return (d['UserTime'] / WIN32_PROCESS_TIMES_TICKS_PER_SECOND, + d['KernelTime'] / WIN32_PROCESS_TIMES_TICKS_PER_SECOND)
FYI, I always see a non-zero, and increasing, value for KernelTime on my box (WinXP Pro SP2). Alas, these counters appear to have even worse resolution than time.time() on my box, incrementing a seemingly variable(!) number of times per second, but well under 100Hz: """ import win32process def scream(): from time import clock as now p = win32process.GetCurrentProcess() d = {} count = 0 start = now() deadline = start + 1.0 while now() < deadline: count += 1 x = win32process.GetProcessTimes(p) d[x['UserTime']] = 1 elapsed = now() - start print "saw", len(d), "distinct values in", \ count, "tries across", elapsed, "seconds" print "user", x['UserTime'], "kernel", x['KernelTime'] print scream() scream() scream() """ One run of that here: saw 74 distinct values in 134542 tries across 1.00000616787 seconds user 11562500 kernel 2187500 saw 81 distinct values in 133365 tries across 1.00000220027 seconds user 24062500 kernel 2968750 saw 68 distinct values in 132984 tries across 1.0000078902 seconds user 34531250 kernel 4687500
Tim Peters wrote:
Author: marc-andre.lemburg Date: Sun May 28 19:46:58 2006 New Revision: 46505
Added: python/trunk/Tools/pybench/systimes.py (contents, props changed)
...
+def win32process_getprocesstimes_systimes(): + d = win32process.GetProcessTimes(win32process.GetCurrentProcess()) + # Note: I'm not sure whether KernelTime on Windows is the same as + # system time on Unix - I've yet to see a non-zero value for + # KernelTime on Windows. + return (d['UserTime'] / WIN32_PROCESS_TIMES_TICKS_PER_SECOND, + d['KernelTime'] / WIN32_PROCESS_TIMES_TICKS_PER_SECOND)
FYI, I always see a non-zero, and increasing, value for KernelTime on my box (WinXP Pro SP2).
Using your code I now see non-zero values as well - perhaps KernelTime refers to calls being made to the win32 APIs ?! I tested using standard Python code, such as long for loops which don't call out to win32 APIs.
Alas, these counters appear to have even worse resolution than time.time() on my box, incrementing a seemingly variable(!) number of times per second, but well under 100Hz:
""" import win32process
def scream(): from time import clock as now p = win32process.GetCurrentProcess() d = {} count = 0 start = now() deadline = start + 1.0 while now() < deadline: count += 1 x = win32process.GetProcessTimes(p) d[x['UserTime']] = 1 elapsed = now() - start print "saw", len(d), "distinct values in", \ count, "tries across", elapsed, "seconds" print "user", x['UserTime'], "kernel", x['KernelTime'] print
scream() scream() scream() """
One run of that here:
saw 74 distinct values in 134542 tries across 1.00000616787 seconds user 11562500 kernel 2187500
saw 81 distinct values in 133365 tries across 1.00000220027 seconds user 24062500 kernel 2968750
saw 68 distinct values in 132984 tries across 1.0000078902 seconds user 34531250 kernel 4687500
Question is whether this is really bad resolution or just the effect of the OS not fetching the timer value in real-time, but using some variable that is only updated at 50-100 Hz. It is interesting to see how WinXP assigns slots to the processes (the elapsed ticks values are number of ticks seen in 1 second). saw 44 distinct values in 145439 tries across 1.00002849524 seconds user 7500000 kernel 4062500 elapsed kernel ticks: 3281250 elapsed user ticks: 6718750 saw 48 distinct values in 146123 tries across 1.00000726349 seconds user 15156250 kernel 6562500 elapsed kernel ticks: 2500000 elapsed user ticks: 7343750 saw 45 distinct values in 146262 tries across 1.00000279365 seconds user 22187500 kernel 9531250 elapsed kernel ticks: 2968750 elapsed user ticks: 6875000 Here's the modified code: """ import win32process def scream(): from time import clock as now p = win32process.GetCurrentProcess() d = {} count = 0 start = now() deadline = start + 1.0 while now() < deadline: count += 1 x = win32process.GetProcessTimes(p) d[x['UserTime']] = 1 elapsed = now() - start print "saw", len(d), "distinct values in", \ count, "tries across", elapsed, "seconds" print "user", x['UserTime'], "kernel", x['KernelTime'] print scream() scream() scream() """ -- Marc-Andre Lemburg eGenix.com Professional Python Services directly from the Source (#1, Jun 02 2006)
Python/Zope Consulting and Support ... http://www.egenix.com/ mxODBC.Zope.Database.Adapter ... http://zope.egenix.com/ mxODBC, mxDateTime, mxTextTools ... http://python.egenix.com/
2006-07-03: EuroPython 2006, CERN, Switzerland 30 days left ::: Try mxODBC.Zope.DA for Windows,Linux,Solaris,FreeBSD for free ! ::::
M.-A. Lemburg wrote:
FYI, I always see a non-zero, and increasing, value for KernelTime on my box (WinXP Pro SP2).
Using your code I now see non-zero values as well - perhaps KernelTime refers to calls being made to the win32 APIs ?!
huh? what else would it refer to ? (what do you think "system time" refers to on a Unix machine?)
One run of that here:
saw 74 distinct values in 134542 tries across 1.00000616787 seconds user 11562500 kernel 2187500
saw 81 distinct values in 133365 tries across 1.00000220027 seconds user 24062500 kernel 2968750
saw 68 distinct values in 132984 tries across 1.0000078902 seconds user 34531250 kernel 4687500
Question is whether this is really bad resolution or just the effect of the OS not fetching the timer value in real-time, but using some variable that is only updated at 50-100 Hz.
but, but, but... didn't I just point out that process time is based on *sampling* of the program counter by the system's scheduler, not on measurements using some high-performance timer. the standard tick interval on a Windows machine is 10-15 ms, depending on platform. (to check the tick on your machine, you can write a simple "time.time() - time.time() until you get back a non-zero value" will reveal, or use http://www.sysinternals.com/Utilities/ClockRes.html ). there is no process timer. this is getting too weird for me. I need some coffee and a break. </F>
Fredrik Lundh wrote:
M.-A. Lemburg wrote:
FYI, I always see a non-zero, and increasing, value for KernelTime on my box (WinXP Pro SP2). Using your code I now see non-zero values as well - perhaps KernelTime refers to calls being made to the win32 APIs ?!
huh? what else would it refer to ?
Not sure, that's why I'm asking. The MS docs say: "A pointer to a FILETIME structure that receives the amount of time that the process has executed in kernel mode.". http://msdn.microsoft.com/library/default.asp?url=/library/en-us/dllproc/bas...
(what do you think "system time" refers to on a Unix machine?)
man 3 times: "The tms_stime field contains the CPU time spent in the system while executing tasks on behalf of the calling process." tms_stime refers to "system time". Both definitions are, well, fuzzy :-) E.g. it is not clear whether time.sleep(10) or time spent waiting for I/O should be considered system time, even though the system is spending time on behalf of the process, albeit usually giving it to other processes. I just wonder whether they both refer to the same fuzzy definition.
One run of that here:
saw 74 distinct values in 134542 tries across 1.00000616787 seconds user 11562500 kernel 2187500
saw 81 distinct values in 133365 tries across 1.00000220027 seconds user 24062500 kernel 2968750
saw 68 distinct values in 132984 tries across 1.0000078902 seconds user 34531250 kernel 4687500 Question is whether this is really bad resolution or just the effect of the OS not fetching the timer value in real-time, but using some variable that is only updated at 50-100 Hz.
but, but, but... didn't I just point out that process time is based on *sampling* of the program counter by the system's scheduler, not on measurements using some high-performance timer. the standard tick interval on a Windows machine is 10-15 ms, depending on platform. (to check the tick on your machine, you can write a simple "time.time() - time.time() until you get back a non-zero value" will reveal, or use http://www.sysinternals.com/Utilities/ClockRes.html ). there is no process timer.
Maybe there's just a misunderstanding of terms here: what I refer to as timer is really just some counter that gets incremented at fixed intervals. This could be an interrupt based counter such as the jiffies counter on Linux or a CPU based one such as the one used by the clock_gettime() real-time POSIX APIs. The question is whether the process time is accounted for in an accurate way or not, ie. which timer the system scheduler uses to update the process time counter and what it's minimum time slice is. If the scheduler just allows an accuracy of 10ms, then process time will only ever be as accurate as that. If you have code to run that uses less CPU time, a wall-clock timer (in hardware) will likely give you more accurate results, than counting clock ticks. OTOH, if you repeat the code execution for several seconds, then a process time based approach will give you more accurate results (unless you happen to run on a real-time OS with fixed and guaranteed scheduling time slices). -- Marc-Andre Lemburg eGenix.com Professional Python Services directly from the Source (#1, Jun 02 2006)
Python/Zope Consulting and Support ... http://www.egenix.com/ mxODBC.Zope.Database.Adapter ... http://zope.egenix.com/ mxODBC, mxDateTime, mxTextTools ... http://python.egenix.com/
2006-07-03: EuroPython 2006, CERN, Switzerland 30 days left ::: Try mxODBC.Zope.DA for Windows,Linux,Solaris,FreeBSD for free ! ::::
[Marc-Andre]
Using your code I now see non-zero values as well - perhaps KernelTime refers to calls being made to the win32 APIs ?!
[Fredrik]
huh? what else would it refer to ?
[Marc-Andre]
Not sure, that's why I'm asking. The MS docs say: "A pointer to a FILETIME structure that receives the amount of time that the process has executed in kernel mode.".
My understanding is that the docs are correct there, and "kernel mode" could just as well (perhaps better) be called "privileged mode". The OS kernel runs at a higher privilege level, which is partly enforced by modern CPU hardware. For example, code triggered by HW interrupts runs in privileged mode, and can execute instructions that the HW refuses to execute in "user mode", and access the entirety of physical RAM directly. Device drivers usually run in privileged/kernel mode too. This isn't the same as "Win32 API call", neither is it the same as anything else that can make instant sense to a non-OS hacker. More here: http://en.wikipedia.org/wiki/CPU_modes http://en.wikipedia.org/wiki/Architecture_of_the_Windows_NT_operating_system... In any case, distinguishing between user time and system time is pretty much pointless in most benchmarks. I want to know how long the code takes, and couldn't care less how much of it runs with some CPU privilege bit set (unless, as mentioned before, I'm running on a timesharing mainframe and get charged different rates for user time than for system time). BTW, it may be the case that time waiting for a page fault to get resolved from disk doesn't get charged against "user time" or "kernel time" (depends on OS details). But if a code change is made that significantly increases or decreases page faults, that's certainly something a benchmarker wants to know about. A high-resolution wall-clock timer is typically the only way to approach measuring that kind of thing directly. Example: I ran this on WinXP just now: """ i = 0 d = {} while True: i += 1 d[i] = 1 if i % 1000000 == 0: print i """ It quickly brought my box to its knees, but _total_ system-wide "user time" and "kernel time" essentially stopped increasing. The paging time wasn't getting charged to anything. Likewise after hitting Ctrl-C, the time required to swap all of the dict back in from disk (in order to decref millions of int objects) didn't appear to be charged to anything either. The CPU was simply idle most of that time, waiting for disk traffic.
Tim Peters wrote:
My understanding is that the docs are correct there, and "kernel mode" could just as well (perhaps better) be called "privileged mode". The OS kernel runs at a higher privilege level, which is partly enforced by modern CPU hardware. For example, code triggered by HW interrupts runs in privileged mode, and can execute instructions that the HW refuses to execute in "user mode", and access the entirety of physical RAM directly. Device drivers usually run in privileged/kernel mode too. This isn't the same as "Win32 API call", neither is it the same as anything else that can make instant sense to a non-OS hacker.
well, yes and no -- the time needed to handle HW interrupts, kernel processes, etc will of course be assigned to those processes, not your user process. when the scheduler interrupts your process to "handle a tick", it knows very well what process it was interrupting, and whether that process was running in privileged mode or not. (and for an ordinary user process running in the win32 subsystem, the best way to end up in kernel space is to call a win32 api function that (directly or indirectly) requires kernel-level support).
In any case, distinguishing between user time and system time is pretty much pointless in most benchmarks. I want to know how long the code takes, and couldn't care less how much of it runs with some CPU privilege bit set (unless, as mentioned before, I'm running on a timesharing mainframe and get charged different rates for user time than for system time).
if you're running on a mainframe, chances are that you have excellent hardware support for this ;-)
BTW, it may be the case that time waiting for a page fault to get resolved from disk doesn't get charged against "user time" or "kernel time" (depends on OS details).
oh, I find it hard to believe that any modern operating system would consider waiting for I/O to finish to be a user process issue...
But if a code change is made that significantly increases or decreases page faults, that's certainly something a benchmarker wants to know about. A high-resolution wall-clock timer is typically the only way to approach measuring that kind of thing directly.
absolutely. but I don't think we have that kind of tests in PyBench; they are all interpreter microbenchmarks, and virtually the only thing that can cause them to end up in kernel space on their own is memory allocations... </F>
participants (4)
-
Fredrik Lundh -
M.-A. Lemburg -
marc-andre.lemburg -
Tim Peters