[Twisted-Python] transport.write performance.

A simple TCP echo server using epoll reactor:this server process take 60% cpu on 4000 request/s. If use self.transport.getHandle().send instead of self.transport.write,it take 30% cpu on 4000 request/s. Why transport.write take more user cpu?Why twisted performance so poor?(echosvr.c using libevent only take 12% cpu on 4000 request/s) tsvr.py-----------------------------------------------------------import sys, time, random, socket, tracebackfrom twisted.internet import epollreactorepollreactor.install()from twisted.internet import defer, reactor, taskfrom twisted.internet.protocol import Protocol, Factoryfrom protocol import TCPServerProtocol def main(): tcpprotocol = TCPServerProtocol factory = Factory() factory.protocol = tcpprotocol reactor.listenTCP(9976, factory) reactor.run() if __name__ == '__main__': main() protocol.py--------------------------------------------------------- import socketimport datetimeimport tracebackfrom twisted.protocols.basic import LineReceiverfrom twisted.internet import protocol class TCPServerProtocol(LineReceiver): req_count = 0 req_time = datetime.datetime.now() def lineReceived(self, data): TCPServerProtocol.req_count+=1 if TCPServerProtocol.req_count%10000==0: ct = datetime.datetime.now() dt = ct-TCPServerProtocol.req_time pps = 10000/(dt.seconds+dt.microseconds/1000000.0) TCPServerProtocol.req_time=ct print('RPS='+str(pps)) try: #self.transport.write(data) self.transport.getHandle().send(data) except: traceback.print_exc() tcli.py -----------------------------------------------------------------import sysimport socketimport tracebackimport timeimport datetime host = 'localhost'port = 9976loopcount = 300sockcount = 5000RPS = 4000 ss=[]for x in xrange(sockcount): ss.append(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) ss[x].connect((host, port)) ss[x].settimeout(120) for x in xrange(10000000): st = datetime.datetime.now() for y in xrange(loopcount): try: if ss[x%sockcount]!=None: ss[x%sockcount].sendall('1234567890\r\n') 
ss[x%sockcount].recv(1024) except: print y sys.exit() time.sleep(0.1) dt = (datetime.datetime.now()-st) plc = loopcount/(dt.seconds+dt.microseconds/1000000.0) print loopcount/(dt.seconds+dt.microseconds/1000000.0) #auto adjust RPS if plc<RPS: if RPS-plc>50: loopcount+=10 else: if plc-RPS>50: loopcount-=10 echosvr.c ----------------------------------------------------------------------#include <stdio.h>#include <stdlib.h>#include <errno.h>#include <assert.h> #include <event2/event.h>#include <event2/bufferevent.h> #define LISTEN_PORT 9976#define LISTEN_BACKLOG 32 #ifdef FD_SETSIZE#undef FD_SETSIZE#endif#define FD_SETSIZE 65536 void do_accept(evutil_socket_t listener, short event, void *arg);void read_cb(struct bufferevent *bev, void *arg);void error_cb(struct bufferevent *bev, short event, void *arg);void write_cb(struct bufferevent *bev, void *arg); int main(int argc, char *argv[]){ int ret; evutil_socket_t listener; listener = socket(AF_INET, SOCK_STREAM, 0); assert(listener > 0); evutil_make_listen_socket_reuseable(listener); struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; sin.sin_port = htons(LISTEN_PORT); if (bind(listener, (struct sockaddr *)&sin, sizeof(sin)) < 0) { perror("bind"); return 1; } if (listen(listener, LISTEN_BACKLOG) < 0) { perror("listen"); return 1; } printf ("Listening...\n"); evutil_make_socket_nonblocking(listener); struct event_base *base = event_base_new(); assert(base != NULL); struct event *listen_event; listen_event = event_new(base, listener, EV_READ|EV_PERSIST, do_accept, (void*)base); event_add(listen_event, NULL); event_base_dispatch(base); printf("The End."); return 0;} void do_accept(evutil_socket_t listener, short event, void *arg){ struct event_base *base = (struct event_base *)arg; evutil_socket_t fd; struct sockaddr_in sin; socklen_t slen; fd = accept(listener, (struct sockaddr *)&sin, &slen); if (fd < 0) { perror("accept"); return; } if (fd > FD_SETSIZE) { perror("fd > FD_SETSIZE\n"); return; } 
printf("ACCEPT: fd = %u\n", fd); struct bufferevent *bev = bufferevent_socket_new(base, fd, BEV_OPT_CLOSE_ON_FREE); bufferevent_setcb(bev, read_cb, NULL, error_cb, arg); bufferevent_enable(bev, EV_READ|EV_WRITE|EV_PERSIST);} void read_cb(struct bufferevent *bev, void *arg){#define MAX_LINE 256 char line[MAX_LINE+1]; int n; evutil_socket_t fd = bufferevent_getfd(bev); while (n = bufferevent_read(bev, line, MAX_LINE), n > 0) { line[n] = '\0'; //printf("fd=%u, read line: %s\n", fd, line); bufferevent_write(bev, line, n); }} void write_cb(struct bufferevent *bev, void *arg) {} void error_cb(struct bufferevent *bev, short event, void *arg){ evutil_socket_t fd = bufferevent_getfd(bev); printf("fd = %u, ", fd); if (event & BEV_EVENT_TIMEOUT) { printf("Timed out\n"); //if bufferevent_set_timeouts() called } else if (event & BEV_EVENT_EOF) { printf("connection closed\n"); } else if (event & BEV_EVENT_ERROR) { printf("some other error\n"); } bufferevent_free(bev);}

tsvr.py ----------------------------------------------------------- import sys, time, random, socket, traceback from twisted.internet import epollreactor epollreactor.install() from twisted.internet import defer, reactor, task from twisted.internet.protocol import Protocol, Factory from protocol import TCPServerProtocol def main(): tcpprotocol = TCPServerProtocol factory = Factory() factory.protocol = tcpprotocol reactor.listenTCP(9976, factory) reactor.run() if __name__ == '__main__': main() protocol.py --------------------------------------------------------- import socket import datetime import traceback from twisted.protocols.basic import LineReceiver from twisted.internet import protocol class TCPServerProtocol(LineReceiver): req_count = 0 req_time = datetime.datetime.now() def lineReceived(self, data): TCPServerProtocol.req_count+=1 if TCPServerProtocol.req_count%10000==0: ct = datetime.datetime.now() dt = ct-TCPServerProtocol.req_time pps = 10000/(dt.seconds+dt.microseconds/1000000.0) TCPServerProtocol.req_time=ct print('RPS='+str(pps)) try: #self.transport.write(data) self.transport.getHandle().send(data) except: traceback.print_exc() tcli.py ----------------------------------------------------------------- import sys import socket import traceback import time import datetime host = 'localhost' port = 9976 loopcount = 300 sockcount = 5000 RPS = 4000 ss=[] for x in xrange(sockcount): ss.append(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) ss[x].connect((host, port)) ss[x].settimeout(120) for x in xrange(10000000): st = datetime.datetime.now() for y in xrange(loopcount): try: if ss[x%sockcount]!=None: ss[x%sockcount].sendall('1234567890\r\n') ss[x%sockcount].recv(1024) except: print y sys.exit() time.sleep(0.1) dt = (datetime.datetime.now()-st) plc = loopcount/(dt.seconds+dt.microseconds/1000000.0) print loopcount/(dt.seconds+dt.microseconds/1000000.0) #auto adjust RPS if plc<RPS: if RPS-plc>50: loopcount+=10 else: if plc-RPS>50: loopcount-=10 echosvr.c 
---------------------------------------------------------------------- #include <stdio.h> #include <stdlib.h> #include <errno.h> #include <assert.h> #include <event2/event.h> #include <event2/bufferevent.h> #define LISTEN_PORT 9976 #define LISTEN_BACKLOG 32 #ifdef FD_SETSIZE #undef FD_SETSIZE #endif #define FD_SETSIZE 65536 void do_accept(evutil_socket_t listener, short event, void *arg); void read_cb(struct bufferevent *bev, void *arg); void error_cb(struct bufferevent *bev, short event, void *arg); void write_cb(struct bufferevent *bev, void *arg); int main(int argc, char *argv[]) { int ret; evutil_socket_t listener; listener = socket(AF_INET, SOCK_STREAM, 0); assert(listener> 0); evutil_make_listen_socket_reuseable(listener); struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; sin.sin_port = htons(LISTEN_PORT); if (bind(listener, (struct sockaddr *)&sin, sizeof(sin)) < 0) { perror("bind"); return 1; } if (listen(listener, LISTEN_BACKLOG) < 0) { perror("listen"); return 1; } printf ("Listening...\n"); evutil_make_socket_nonblocking(listener); struct event_base *base = event_base_new(); assert(base != NULL); struct event *listen_event; listen_event = event_new(base, listener, EV_READ|EV_PERSIST, do_accept, (void*)base); event_add(listen_event, NULL); event_base_dispatch(base); printf("The End."); return 0; } void do_accept(evutil_socket_t listener, short event, void *arg) { struct event_base *base = (struct event_base *)arg; evutil_socket_t fd; struct sockaddr_in sin; socklen_t slen; fd = accept(listener, (struct sockaddr *)&sin, &slen); if (fd < 0) { perror("accept"); return; } if (fd> FD_SETSIZE) { perror("fd> FD_SETSIZE\n"); return; } printf("ACCEPT: fd = %u\n", fd); struct bufferevent *bev = bufferevent_socket_new(base, fd, BEV_OPT_CLOSE_ON_FREE); bufferevent_setcb(bev, read_cb, NULL, error_cb, arg); bufferevent_enable(bev, EV_READ|EV_WRITE|EV_PERSIST); } void read_cb(struct bufferevent *bev, void *arg) { #define MAX_LINE 256 char 
line[MAX_LINE+1]; int n; evutil_socket_t fd = bufferevent_getfd(bev); while (n = bufferevent_read(bev, line, MAX_LINE), n> 0) { line[n] = '\0'; //printf("fd=%u, read line: %s\n", fd, line); bufferevent_write(bev, line, n); } } void write_cb(struct bufferevent *bev, void *arg) {} void error_cb(struct bufferevent *bev, short event, void *arg) { evutil_socket_t fd = bufferevent_getfd(bev); printf("fd = %u, ", fd); if (event & BEV_EVENT_TIMEOUT) { printf("Timed out\n"); //if bufferevent_set_timeouts() called } else if (event & BEV_EVENT_EOF) { printf("connection closed\n"); } else if (event & BEV_EVENT_ERROR) { printf("some other error\n"); } bufferevent_free(bev); }

On 08:54 am, zipxing@hotmail.com wrote:
A simple TCP echo server using the epoll reactor: this server process takes 60% CPU at 4000 requests/s. If I use self.transport.getHandle().send instead of self.transport.write, it takes 30% CPU at 4000 requests/s. Why does transport.write take more user CPU? Why is Twisted's performance so poor?
`transport.write` will actually try to deliver your data. `send` will as likely drop it on the floor as deliver it. Correctness sometimes costs a little more CPU time. Jean-Paul

On 30/07/13 13:55, Laurens Van Houtven wrote:
On Tue, Jul 30, 2013 at 2:44 PM, Phil Mayers <p.mayers@imperial.ac.uk <mailto:p.mayers@imperial.ac.uk>> wrote:
For TCP?
Yes. See also the difference between socket.send and socket.sendall :)
socket.send returns the number of bytes that it actually sent.
Sure - FD writes don't, in general, have to take your entire buffer (or any of it). But once accepted, the data is no more or less likely to be dropped than data sent with transport.write. Obviously if the OP is using "send" without checking the return value, they're doing it wrong!

Hi zipxing, You don't mention the interpreter. Is it CPython? What kind of results do you get trying it on PyPy? Also, you don't need to specify epollreactor. Recent versions of twisted will automagically choose the appropriate backend. I fixed this and some other cleanups and got: ---- from time import clock from twisted.internet import protocol, reactor from twisted.protocols import basic class MeasuringEchoProtocol(basic.LineReceiver): MEASUREMENT_INTERVAL = 1000 def lineReceived(self, data): self.factory.requests += 1 if self.factory.requests % self.MEASUREMENT_INTERVAL == 0: print "RPS: {0}".format(self.factory.requests / clock()) self.transport.write(data) class ServerFactory(protocol.ServerFactory): protocol = MeasuringEchoProtocol def __init__(self): self.requests = 0 def main(): reactor.listenTCP(9976, ServerFactory()) clock() reactor.run() if __name__ == '__main__': main() ---- Keep in mind that due to setup time few requests get handled right when it starts, so the server RPS will take some time to balance out. On my wimpy laptop on battery power, that was around 420.274404782. cheers lvh

Great! pypy take 20% cpu, 3x fast than cpython! Thanks! A little problem: cjson and ujson can't install to pypy... ZipXing ________________________________
From: _@lvh.io Date: Tue, 30 Jul 2013 13:09:33 +0200 To: twisted-python@twistedmatrix.com Subject: Re: [Twisted-Python] transport.write performance.
Hi zipxing,
You don't mention the interpreter. Is it CPython? What kind of results do you get trying it on PyPy?
Also, you don't need to specify epollreactor. Recent versions of twisted will automagically choose the appropriate backend. I fixed this and some other cleanups and got:
---- from time import clock from twisted.internet import protocol, reactor from twisted.protocols import basic
class MeasuringEchoProtocol(basic.LineReceiver): MEASUREMENT_INTERVAL = 1000
def lineReceived(self, data): self.factory.requests += 1 if self.factory.requests % self.MEASUREMENT_INTERVAL == 0: print "RPS: {0}".format(self.factory.requests / clock())
self.transport.write(data)
class ServerFactory(protocol.ServerFactory): protocol = MeasuringEchoProtocol
def __init__(self): self.requests = 0
def main(): reactor.listenTCP(9976, ServerFactory()) clock() reactor.run()
if __name__ == '__main__': main() ----
Keep in mind that due to setup time few requests get handled right when it starts, so the server RPS will take some time to balance out. On my wimpy laptop on battery power, that was around 420.274404782.
cheers lvh
_______________________________________________ Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python

On Jul 31, 2013 4:19 AM, "zipxing" <zipxing@hotmail.com> wrote:
Great! PyPy takes 20% CPU, 3x faster than CPython! Thanks!
A little problem: cjson and ujson can't be installed on PyPy...
What's wrong with the builtin json module?
ZipXing
________________________________
From: _@lvh.io Date: Tue, 30 Jul 2013 13:09:33 +0200 To: twisted-python@twistedmatrix.com Subject: Re: [Twisted-Python] transport.write performance.
Hi zipxing,
You don't mention the interpreter. Is it CPython? What kind of results do you get trying it on PyPy?
Also, you don't need to specify epollreactor. Recent versions of twisted will automagically choose the appropriate backend. I fixed this and some other cleanups and got:
---- from time import clock from twisted.internet import protocol, reactor from twisted.protocols import basic
class MeasuringEchoProtocol(basic.LineReceiver): MEASUREMENT_INTERVAL = 1000
def lineReceived(self, data): self.factory.requests += 1 if self.factory.requests % self.MEASUREMENT_INTERVAL == 0: print "RPS: {0}".format(self.factory.requests / clock())
self.transport.write(data)
class ServerFactory(protocol.ServerFactory): protocol = MeasuringEchoProtocol
def __init__(self): self.requests = 0
def main(): reactor.listenTCP(9976, ServerFactory()) clock() reactor.run()
if __name__ == '__main__': main() ----
Keep in mind that due to setup time few requests get handled right when it starts, so the server RPS will take some time to balance out. On my wimpy laptop on battery power, that was around 420.274404782.
cheers lvh
_______________________________________________ Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python
Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python

pypy+builtin_json performance < cpython+ultrajson ________________________________
Date: Wed, 31 Jul 2013 09:22:53 +0200 From: _@lvh.io To: twisted-python@twistedmatrix.com Subject: Re: [Twisted-Python] transport.write performance.
On Jul 31, 2013 4:19 AM, "zipxing" <zipxing@hotmail.com<mailto:zipxing@hotmail.com>> wrote:
Great! pypy take 20% cpu, 3x fast than cpython! Thanks!
A little problem: cjson and ujson can't install to pypy...
What's wrong with the builtin json module?
ZipXing
________________________________
From: _@lvh.io<http://lvh.io> Date: Tue, 30 Jul 2013 13:09:33 +0200 To: twisted-python@twistedmatrix.com<mailto:twisted-python@twistedmatrix.com> Subject: Re: [Twisted-Python] transport.write performance.
Hi zipxing,
You don't mention the interpreter. Is it CPython? What kind of results do you get trying it on PyPy?
Also, you don't need to specify epollreactor. Recent versions of twisted will automagically choose the appropriate backend. I fixed this and some other cleanups and got:
---- from time import clock from twisted.internet import protocol, reactor from twisted.protocols import basic
class MeasuringEchoProtocol(basic.LineReceiver): MEASUREMENT_INTERVAL = 1000
def lineReceived(self, data): self.factory.requests += 1 if self.factory.requests % self.MEASUREMENT_INTERVAL == 0: print "RPS: {0}".format(self.factory.requests / clock())
self.transport.write(data)
class ServerFactory(protocol.ServerFactory): protocol = MeasuringEchoProtocol
def __init__(self): self.requests = 0
def main(): reactor.listenTCP(9976, ServerFactory()) clock() reactor.run()
if __name__ == '__main__': main() ----
Keep in mind that due to setup time few requests get handled right when it starts, so the server RPS will take some time to balance out. On my wimpy laptop on battery power, that was around 420.274404782.
cheers lvh
_______________________________________________ Twisted-Python mailing list Twisted-Python@twistedmatrix.com<mailto:Twisted-Python@twistedmatrix.com> http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python
Twisted-Python mailing list Twisted-Python@twistedmatrix.com<mailto:Twisted-Python@twistedmatrix.com> http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python
_______________________________________________ Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python

Hi Glyph, On 8/1/13 11:46 AM, Glyph wrote:
On Aug 1, 2013, at 1:49 AM, zipxing <zipxing@hotmail.com <mailto:zipxing@hotmail.com>> wrote:
pypy+builtin_json performance < cpython+ultrajson
Do you have a reference to a benchmark confirming this? I'm curious about it.
-glyph
Here's one: http://liangnuren.wordpress.com/2012/08/13/python-json-performance/ - L. Daniel Burr

tjson.py -------------------------------------------------------------------------- import json #for pypy #import ujson as json #for python tdic = {"aaa":1, "bbb":2, "ccc":3, "ddd":['1','2','3','4','abcdefghijk'], "eee":{"aaaa":1, "bbbb":2, "cccc":[1,2,3,4,5,'aaa']}} for x in xrange(1000000): a = json.dumps(tdic) b = json.loads(a) On my computer, 1M dumps&loads, using 6s with cpython+ultrajson 30s with pypy+json ________________________________
From: glyph@twistedmatrix.com Date: Thu, 1 Aug 2013 09:46:54 -0700 To: twisted-python@twistedmatrix.com Subject: Re: [Twisted-Python] transport.write performance.
On Aug 1, 2013, at 1:49 AM, zipxing <zipxing@hotmail.com<mailto:zipxing@hotmail.com>> wrote:
pypy+builtin_json performance < cpython+ultrajson
Do you have a reference to a benchmark confirming this? I'm curious about it.
-glyph
_______________________________________________ Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python

cpython+ujson 6s cpython+cjson 9s pypy+json 30s ----------------------------------------
From: zipxing@hotmail.com To: twisted-python@twistedmatrix.com Date: Mon, 5 Aug 2013 19:26:40 +0800 Subject: Re: [Twisted-Python] transport.write performance.
tjson.py -------------------------------------------------------------------------- import json #for pypy
#import ujson as json #for python
tdic = {"aaa":1, "bbb":2, "ccc":3, "ddd":['1','2','3','4','abcdefghijk'], "eee":{"aaaa":1, "bbbb":2, "cccc":[1,2,3,4,5,'aaa']}}
for x in xrange(1000000): a = json.dumps(tdic) b = json.loads(a)
On my computer, 1M dumps&loads, using 6s with cpython+ultrajson 30s with pypy+json
________________________________
From: glyph@twistedmatrix.com Date: Thu, 1 Aug 2013 09:46:54 -0700 To: twisted-python@twistedmatrix.com Subject: Re: [Twisted-Python] transport.write performance.
On Aug 1, 2013, at 1:49 AM, zipxing <zipxing@hotmail.com<mailto:zipxing@hotmail.com>> wrote:
pypy+builtin_json performance < cpython+ultrajson
Do you have a reference to a benchmark confirming this? I'm curious about it.
-glyph
_______________________________________________ Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python
Twisted-Python mailing list Twisted-Python@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python

On Aug 1, 2013, at 1:49 AM, zipxing <zipxing@hotmail.com> wrote:
pypy+builtin_json performance < cpython+ultrajson
Just curious here: what version of PyPy did you do your tests with? From what I've heard, the somewhat recently-released 2.1 should have improved JSON parsing performance. -glyph

On Wed, Aug 7, 2013 at 9:58 PM, Glyph <glyph@twistedmatrix.com> wrote:
Just curious here: what version of PyPy did you do your tests with? From what I've heard, the somewhat recently-released 2.1 should have improved JSON parsing performance.
FWIW, Maciej commented that they don't expect to be *faster*, because they actually care about the JSON spec or checking return values. Things which ultrajson conveniently ignores :-) It's pretty easy to do the wrong thing fast, but that seems to be a common theme in this thread. lvh

On Aug 7, 2013, at 1:06 PM, Laurens Van Houtven <_@lvh.io> wrote:
FWIW, Maciej commented that they don't expect to be *faster*, because they actually care about the JSON spec or checking return values. Things which ultrajson conveniently ignores :-) It's pretty easy to do the wrong thing fast, but that seems to be a common theme in this thread.
I just said "improved", i.e. 2.1 ought to be faster than 2.0, no claims relative to anything else :). I'm aware that ultrajson is pretty bad, but cjson had comparable (if slightly slower) performance. Does it also have correctness and security problems? -glyph

On Wed, Aug 7, 2013 at 10:10 PM, Glyph <glyph@twistedmatrix.com> wrote:
On Aug 7, 2013, at 1:06 PM, Laurens Van Houtven <_@lvh.io> wrote:
FWIW, Maciej commented that they don't expect to be *faster*, because they actually care about the JSON spec or checking return values. Things which ultrajson conveniently ignores :-) It's pretty easy to do the wrong thing fast, but that seems to be a common theme in this thread.
I just said "improved", i.e. 2.1 ought to be faster than 2.0, no claims relative to anything else :).
I'm aware that ultrajson is pretty bad, but cjson had comparable (if slightly slower) performance. Does it also have correctness and security problems?
It's also important to note that in a microbenchmark, JIT warmup can be a significant factor in PyPy. If you're interested in the performance of short-running applications (eg. command-line tools) then this is very relevant, but for a long-running server process you're unlikely to care about the warmup overhead but rather the sustained performance thereafter. -- mithrandi, i Ainil en-Balandor, a faer Ambar

Perhaps it's time to move this thread to the PyPy or *json list? Thanks, Jean-Paul
participants (7)
-
exarkun@twistedmatrix.com
-
Glyph
-
L. Daniel Burr
-
Laurens Van Houtven
-
Phil Mayers
-
Tristan Seligmann
-
zipxing