I didn't find any way to improve the actual overflow check itself, but if you replace the existing "fast path" test entirely with checks based on unsigned masking, you do get a performance improvement. For a wide variety of input patterns, I get about an 18% speedup versus the core long multiply code when it's modified as shown below:

/* Masks for the halves of an unsigned long; the shift counts
   assume 8-bit bytes, so sizeof * 4 is half the width in bits. */
#define UL_LO_HI_BIT (((unsigned long)1) << (sizeof(unsigned long) * 4U))
#define UL_LO_MASK ((UL_LO_HI_BIT) - 1)
#define UL_HI_MASK (~(UL_LO_MASK))
#define UL_HI_LO_BIT (((unsigned long)1) << ((sizeof(unsigned long) * 4U) - 1))
#define UL_OVERFLOW_IMPOSSIBLE_MASK ((UL_HI_LO_BIT) - 1)
#define UL_OVERFLOW_POSSIBLE_MASK (~(UL_OVERFLOW_IMPOSSIBLE_MASK))

long
core_int_mul(long a, long b)
{
    /* Multiply as unsigned so the wraparound is well defined;
       the bit pattern is what the checks below expect. */
    long longprod = (long)((unsigned long)a * (unsigned long)b);

    /* Fast path: accept a if its high half is pure sign extension
       (|a| <= 2**(bits/2)) and b if everything from the low half's
       top bit up is pure sign extension (|b| <= 2**(bits/2 - 1)).
       Then |a*b| <= 2**(bits-1), so longprod is exact -- except for
       the one admissible pair whose product is exactly +2**(bits-1).
       That product wraps to LONG_MIN, so the extra test below routes
       it to the slow path, which signals the overflow (needs
       <limits.h>). */
    unsigned long ma = a & UL_HI_MASK;
    if (ma == ((a < 0) ? UL_HI_MASK : 0)) {
        unsigned long mb = b & UL_OVERFLOW_POSSIBLE_MASK;
        if (mb == ((b < 0) ? UL_OVERFLOW_POSSIBLE_MASK : 0)
            && longprod != LONG_MIN) {
            return longprod;
        }
    }

    /* Slow path: compare against the double-precision product. */
    {
        double doubleprod = (double)a * (double)b;
        double doubled_longprod = (double)longprod;
        double diff = doubled_longprod - doubleprod;
        double absdiff = (diff >= 0.0) ? diff : -diff;
        double absprod = (doubleprod >= 0.0) ? doubleprod : -doubleprod;
        /* absdiff/absprod <= 1/32 iff 32 * absdiff <= absprod
           -- 5 good bits is "close enough" */
        if (32.0 * absdiff <= absprod) {
            return longprod;
        }
        else {
            SIGNAL_AN_ERROR;  /* placeholder: raise OverflowError */
        }
    }
}

This version shows no apparent degradation versus the existing implementation when fed sets of multiplicands evenly distributed over range(-sys.maxint, sys.maxint), and it almost always shows an improvement.

Shall I submit a patch?

-Jerry
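
P.S. If it helps review, here is a small standalone sanity check of the fast-path predicate, factored out of the function above. It's a sketch, not part of the patch, and the asserted boundary values assume a 64-bit unsigned long:

#include <assert.h>

#define UL_LO_HI_BIT (((unsigned long)1) << (sizeof(unsigned long) * 4U))
#define UL_LO_MASK ((UL_LO_HI_BIT) - 1)
#define UL_HI_MASK (~(UL_LO_MASK))
#define UL_HI_LO_BIT (((unsigned long)1) << ((sizeof(unsigned long) * 4U) - 1))
#define UL_OVERFLOW_IMPOSSIBLE_MASK ((UL_HI_LO_BIT) - 1)
#define UL_OVERFLOW_POSSIBLE_MASK (~(UL_OVERFLOW_IMPOSSIBLE_MASK))

/* The fast-path acceptance test from core_int_mul, as a predicate. */
static int
fits_fast_path(long a, long b)
{
    unsigned long ma = a & UL_HI_MASK;
    unsigned long mb = b & UL_OVERFLOW_POSSIBLE_MASK;
    return ma == ((a < 0) ? UL_HI_MASK : 0)
        && mb == ((b < 0) ? UL_OVERFLOW_POSSIBLE_MASK : 0);
}

int
main(void)
{
    /* With a 64-bit long: a is accepted on [-2**32, 2**32 - 1],
       b on [-2**31, 2**31 - 1]. */
    assert(fits_fast_path(123456789L, -123456789L));
    assert(fits_fast_path(-(1L << 32), (1L << 31) - 1));
    assert(!fits_fast_path(1L << 32, 0L));    /* a one past its range */
    assert(!fits_fast_path(0L, 1L << 31));    /* b one past its range */
    return 0;
}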
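
P.P.S. The timing setup, roughly. This is a sketch rather than the exact harness I used: the iteration count is arbitrary, the rand()-based operand generator is a stand-in for the distributions I actually tested, and it assumes SIGNAL_AN_ERROR is defined as something harmless for benchmarking (say, incrementing a counter and returning 0):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* core_int_mul() as above, compiled in the same program */
long core_int_mul(long a, long b);

int
main(void)
{
    long acc = 0;               /* consumed so the loop isn't optimized away */
    long i;
    clock_t t0, t1;

    srand(12345);               /* fixed seed for repeatable runs */
    t0 = clock();
    for (i = 0; i < 10000000L; i++) {
        long a = (long)rand() - RAND_MAX / 2;
        long b = (long)rand() - RAND_MAX / 2;
        acc ^= core_int_mul(a, b);
    }
    t1 = clock();
    printf("%.3f seconds (acc=%lx)\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC, (unsigned long)acc);
    return 0;
}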