How can I change the HTTP request to avoid gzip

I have an HTTP proxy made with twisted.web and want to change the request that the browser sends to the proxy, so that the value of the 'accept-encoding' key is changed from 'gzip,deflate' to ' '.
I use the example from the Twisted book:
By adding the overridden process method in WordCountProxyRequest I can get the request headers, but I have found no way to set a key/value pair. I want to make the server think that the browser does not support gzip, because Twisted seems not to support gzip: the response from www.google.com and many (but not all) sites still appears encoded. www.dpreview.com seems not to gzip the response, and so the response is processed correctly.
What can I do to either correctly decode gzip responses or modify the 'accept-encoding' value to nothing so the server does not compress the response?
Thank you! *Example 4-8. wordcountproxy.py*
import sgmllib, re from twisted.web import proxy, http import sys from twisted.python import log log.startLogging(sys.stdout)
WEB_PORT = 8000 PROXY_PORT = 8001
class WordParser(sgmllib.SGMLParser): def __init__(self): sgmllib.SGMLParser.__init__(self) self.chardata = [] self.inBody = False
def start_body(self, attrs): self.inBody = True
def end_body(self): self.inBody = False
def handle_data(self, data): if self.inBody: self.chardata.append(data)
def getWords(self): # extract words wordFinder = re.compile(r'\w*') words = wordFinder.findall("".join(self.chardata)) words = filter(lambda word: word.strip( ), words) print "WORDS ARE", words return words
class WordCounter(object):
    """Tally how often each non-ignored word has been seen."""

    # Common words that are never counted.
    ignoredWords = "the a of in from to this that and or but is was be can could i you they we at".split( )

    def __init__(self):
        # Maps a lower-cased word to the number of times it was added.
        self.words = {}

    def addWords(self, words):
        """Add every entry of ``words`` to the tally, ignoring case.

        Words listed in ``ignoredWords`` are skipped.
        """
        tally = self.words
        for raw in words:
            lowered = raw.lower( )
            if lowered in self.ignoredWords:
                continue
            tally[lowered] = tally.get(lowered, 0) + 1
class WordCountProxyClient(proxy.ProxyClient):
    """Proxy client that feeds text/html response bodies to a WordParser."""

    def handleHeader(self, key, value):
        """Forward the response header, then start a parser for HTML bodies."""
        proxy.ProxyClient.handleHeader(self, key, value)
        if key.lower( ) != "content-type":
            return
        # Ignore any parameters (e.g. charset) after the media type.
        mediaType = value.split(';')[0]
        if mediaType == 'text/html':
            # The parser attribute only exists for text/html responses.
            self.parser = WordParser( )

    def handleResponsePart(self, data):
        """Forward the body chunk and, for HTML responses, parse it too."""
        proxy.ProxyClient.handleResponsePart(self, data)
        if not hasattr(self, 'parser'):
            return
        self.parser.feed(data)

    def handleResponseEnd(self):
        """Finish the response and hand the parsed words to the counter."""
        proxy.ProxyClient.handleResponseEnd(self)
        if not hasattr(self, 'parser'):
            return
        htmlParser = self.parser
        htmlParser.close( )
        self.father.wordCounter.addWords(htmlParser.getWords( ))
        # Drop the parser so later calls see no parser attribute.
        del self.parser
class WordCountProxyClientFactory(proxy.ProxyClientFactory):
    """Client factory whose protocols are WordCountProxyClient instances."""

    def buildProtocol(self, addr):
        """Build the stock proxy client and re-class it as a word counter."""
        built = proxy.ProxyClientFactory.buildProtocol(self, addr)
        # upgrade proxy.proxyClient object to WordCountProxyClient
        built.__class__ = WordCountProxyClient
        return built
class WordCountProxyRequest(proxy.ProxyRequest): protocols = {'http': WordCountProxyClientFactory}
def __init__(self, wordCounter, *args): self.wordCounter = wordCounter proxy.ProxyRequest.__init__(self, *args)
* def process(self): proxy.ProxyRequest.process(self) print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
class WordCountProxy(proxy.Proxy):
    """Proxy channel that creates word-counting requests."""

    def __init__(self, wordCounter):
        """Keep a reference to the shared ``wordCounter``."""
        self.wordCounter = wordCounter
        proxy.Proxy.__init__(self)

    def requestFactory(self, *args):
        """Return a WordCountProxyRequest bound to the shared counter."""
        request = WordCountProxyRequest(self.wordCounter, *args)
        return request
class WordCountProxyFactory(http.HTTPFactory):
    """Server factory for the word-counting proxy."""

    def __init__(self, wordCounter):
        """Keep a reference to the shared ``wordCounter``."""
        self.wordCounter = wordCounter
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a new WordCountProxy that uses the shared counter."""
        return WordCountProxy(self.wordCounter)
# classes for web reporting interface
class WebReportRequest(http.Request):
    """Request that renders the word counts as HTML list items."""

    def __init__(self, wordCounter, *args):
        """Remember ``wordCounter`` and pass the remaining args to the base."""
        self.wordCounter = wordCounter
        http.Request.__init__(self, *args)

    def process(self):
        """Write one <li> per counted word, highest count first."""
        self.setHeader("Content-Type", "text/html")
        ranked = self.wordCounter.words.items( )
        # Stable sort by count, largest first.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        for entry in ranked:
            self.write("<li>%s %s</li>" % entry)
        self.finish( )
class WebReportChannel(http.HTTPChannel):
    """HTTP channel that creates WebReportRequest objects."""

    def __init__(self, wordCounter):
        """Keep a reference to the shared ``wordCounter``."""
        self.wordCounter = wordCounter
        http.HTTPChannel.__init__(self)

    def requestFactory(self, *args):
        """Return a WebReportRequest bound to the shared counter."""
        request = WebReportRequest(self.wordCounter, *args)
        return request
class WebReportFactory(http.HTTPFactory):
    """Server factory for the word-count report page."""

    def __init__(self, wordCounter):
        """Keep a reference to the shared ``wordCounter``."""
        self.wordCounter = wordCounter
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a new WebReportChannel that uses the shared counter."""
        channel = WebReportChannel(self.wordCounter)
        return channel
if __name__ == "__main__":
    from twisted.internet import reactor

    # One counter is shared by the proxy and the report server.
    counter = WordCounter( )

    # Proxy that counts words in the pages it relays.
    prox = WordCountProxyFactory(counter)
    reactor.listenTCP(PROXY_PORT, prox)

    # Web page that reports the counts collected so far.
    reportFactory = WebReportFactory(counter)
    reactor.listenTCP(WEB_PORT, reportFactory)

    reactor.run( )

On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin radudragusin@gmail.com wrote:
I have a HTTP Proxy made with twisted.web and want to change the request that the browser sends to the Proxy such that I erase the value of the 'accept-encoding' key from 'gzip,deflate' to ' '.
I use the example from the Tisted Book:
By adding the overriden process method in WordCountProxyRequest I can get the request header but have found no way to set a key, value pair. I want make the server think that the browser does not support gzip because twisted seems to not support gzip as the response from www.google.com and many (but not all) sites appears still encoded. www.dpreview.com seems not to gzip the response, and so the resonse is processed correctly.
What can I do to either correctly decode gzip responses or modify the 'accept-encoding' value to nothing so the server does not compress the response?
Thank you! *Example 4-8. wordcountproxy.py*
import sgmllib, re from twisted.web import proxy, http import sys from twisted.python import log log.startLogging(sys.stdout)
WEB_PORT = 8000 PROXY_PORT = 8001
class WordParser(sgmllib.SGMLParser): def __init__(self): sgmllib.SGMLParser.__init__(self) self.chardata = [] self.inBody = False
def start_body(self, attrs): self.inBody = True
def end_body(self): self.inBody = False
def handle_data(self, data): if self.inBody: self.chardata.append(data)
def getWords(self): # extract words wordFinder = re.compile(r'\w*') words = wordFinder.findall("".join(self.chardata)) words = filter(lambda word: word.strip( ), words) print "WORDS ARE", words return words
class WordCounter(object): ignoredWords = "the a of in from to this that and or but is was be can could i you they we at".split( )
def __init__(self): self.words = {}
def addWords(self, words): for word in words: word = word.lower( ) if not word in self.ignoredWords: currentCount = self.words.get(word, 0) self.words[word] = currentCount + 1
class WordCountProxyClient(proxy.ProxyClient): def handleHeader(self, key, value): proxy.ProxyClient.handleHeader(self, key, value)
How about skipping it here?
if key.lower( ) == "content-type": if value.split(';')[0] == 'text/html': self.parser = WordParser( )
def handleResponsePart(self, data): proxy.ProxyClient.handleResponsePart(self, data) if hasattr(self, 'parser'): self.parser.feed(data)
def handleResponseEnd(self): proxy.ProxyClient.handleResponseEnd(self) if hasattr(self, 'parser'): self.parser.close( ) self.father.wordCounter.addWords(self.parser.getWords( )) del(self.parser)
class WordCountProxyClientFactory(proxy.ProxyClientFactory): def buildProtocol(self, addr): client = proxy.ProxyClientFactory.buildProtocol(self, addr) # upgrade proxy.proxyClient object to WordCountProxyClient client.__class__ = WordCountProxyClient return client
class WordCountProxyRequest(proxy.ProxyRequest): protocols = {'http': WordCountProxyClientFactory}
def __init__(self, wordCounter, *args): self.wordCounter = wordCounter proxy.ProxyRequest.__init__(self, *args)
- def process(self): proxy.ProxyRequest.process(self) print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
class WordCountProxy(proxy.Proxy): def __init__(self, wordCounter): self.wordCounter = wordCounter proxy.Proxy.__init__(self)
def requestFactory(self, *args): return WordCountProxyRequest(self.wordCounter, *args)
class WordCountProxyFactory(http.HTTPFactory): def __init__(self, wordCounter): self.wordCounter = wordCounter http.HTTPFactory.__init__(self)
def buildProtocol(self, addr): protocol = WordCountProxy(self.wordCounter) return protocol
# classes for web reporting interface class WebReportRequest(http.Request): def __init__(self, wordCounter, *args): self.wordCounter = wordCounter http.Request.__init__(self, *args)
def process(self): self.setHeader("Content-Type", "text/html") words = self.wordCounter.words.items( ) words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1)) for word, count in words: self.write("<li>%s %s</li>" % (word, count)) self.finish( )
class WebReportChannel(http.HTTPChannel): def __init__(self, wordCounter): self.wordCounter = wordCounter http.HTTPChannel.__init__(self)
def requestFactory(self, *args): return WebReportRequest(self.wordCounter, *args)
class WebReportFactory(http.HTTPFactory): def __init__(self, wordCounter): self.wordCounter = wordCounter http.HTTPFactory.__init__(self)
def buildProtocol(self, addr): return WebReportChannel(self.wordCounter)
if __name__ == "__main__": from twisted.internet import reactor counter = WordCounter( ) prox = WordCountProxyFactory(counter) reactor.listenTCP(PROXY_PORT, prox) reactor.listenTCP(WEB_PORT, WebReportFactory(counter)) reactor.run( )
Jean-Paul

On Tue, Dec 16, 2008 at 2:20 PM, Jean-Paul Calderone exarkun@divmod.com wrote:
On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin radudragusin@gmail.com wrote:
I have a HTTP Proxy made with twisted.web and want to change the request that the browser sends to the Proxy such that I erase the value of the 'accept-encoding' key from 'gzip,deflate' to ' '.
I use the example from the Tisted Book:
By adding the overriden process method in WordCountProxyRequest I can get the request header but have found no way to set a key, value pair. I want make the server think that the browser does not support gzip because twisted seems to not support gzip as the response from www.google.com and many (but not all) sites appears still encoded. www.dpreview.com seems not to gzip the response, and so the resonse is processed correctly.
What can I do to either correctly decode gzip responses or modify the 'accept-encoding' value to nothing so the server does not compress the response?
Thank you! *Example 4-8. wordcountproxy.py*
import sgmllib, re from twisted.web import proxy, http import sys from twisted.python import log log.startLogging(sys.stdout)
WEB_PORT = 8000 PROXY_PORT = 8001
class WordParser(sgmllib.SGMLParser): def __init__(self): sgmllib.SGMLParser.__init__(self) self.chardata = [] self.inBody = False
def start_body(self, attrs): self.inBody = True
def end_body(self): self.inBody = False
def handle_data(self, data): if self.inBody: self.chardata.append(data)
def getWords(self): # extract words wordFinder = re.compile(r'\w*') words = wordFinder.findall("".join(self.chardata)) words = filter(lambda word: word.strip( ), words) print "WORDS ARE", words return words
class WordCounter(object): ignoredWords = "the a of in from to this that and or but is was be can could i you they we at".split( )
def __init__(self): self.words = {}
def addWords(self, words): for word in words: word = word.lower( ) if not word in self.ignoredWords: currentCount = self.words.get(word, 0) self.words[word] = currentCount + 1
class WordCountProxyClient(proxy.ProxyClient): def handleHeader(self, key, value): proxy.ProxyClient.handleHeader(self, key, value)
How about skipping it here?
If I use here the following:
print "[", key, ":", value,"]"
I get: 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Cache-Control : no-cache, no-store, max-age=0, must-revalidate ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Pragma : no-cache ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Expires : Fri, 01 Jan 1990 00:00:00 GMT ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec 2008 13:37:21 GMT ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Type : text/javascript; charset=UTF-8 ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie : GMAIL_STAT_3492=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/ dragusin.ro ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie : GMAIL_IMP=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/ dragusin.ro ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Encoding : gzip ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ X-Content-Type-Options : nosniff ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Length : 14340 ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Server : GFE/1.3 ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Connection : Close ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Cache-Control : private, max-age=0 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec 2008 13:37:21 GMT ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Expires : -1 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Type : text/html; charset=UTF-8 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Encoding : gzip ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Server : gws ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Length : 2597 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Connection : Close ]
So that is the response header. I need to override the request header, the one that the browser sends to the proxy server. See below:
if key.lower( ) == "content-type":
if value.split(';')[0] == 'text/html': self.parser = WordParser( )
def handleResponsePart(self, data): proxy.ProxyClient.handleResponsePart(self, data) if hasattr(self, 'parser'): self.parser.feed(data)
def handleResponseEnd(self): proxy.ProxyClient.handleResponseEnd(self) if hasattr(self, 'parser'): self.parser.close( ) self.father.wordCounter.addWords(self.parser.getWords( )) del(self.parser)
class WordCountProxyClientFactory(proxy.ProxyClientFactory): def buildProtocol(self, addr): client = proxy.ProxyClientFactory.buildProtocol(self, addr) # upgrade proxy.proxyClient object to WordCountProxyClient client.__class__ = WordCountProxyClient return client
class WordCountProxyRequest(proxy.ProxyRequest): protocols = {'http': WordCountProxyClientFactory}
def __init__(self, wordCounter, *args): self.wordCounter = wordCounter proxy.ProxyRequest.__init__(self, *args)
- def process(self): proxy.ProxyRequest.process(self) print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
the print above prints:
received_headers: {'accept-language': 'en-us,en;q=0.5', 'accept-encoding': 'gzip,deflate', 'keep-alive': '300', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'user-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.4) Gecko/2008111318 Ubuntu/8.10 (intrepid) Firefox/3.0.4', 'accept-charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'host': 'www.google.com', 'cookie': 'PREF=ID=cfb3eb179de0c1e6:LD=en:NR=100:CR=2:TM=1228315308:LM=1229032156:GM=1:S=ImAuEufbnV6S7BAz; NID=17=lOVMiFLculcrfN-zUO7xxFTTUFzqQqaHOFHcG_BDmYFX8QKYbMoo7GrDoYH-8ASPBlVijG_Hstp7HSDQ_8WQexHPjwz6g_7ZVpBhwmh3vkKuO3jpf9dnzrnWthcW1mGh; S=photos_html=6ScUGfd699g4Xuuh0FeizA; TZ=-120', 'cache-control': 'max-age=0', 'proxy-connection': 'keep-alive'}
these are the values I want to modify, the 'accept-encoding', to be specific. How can I do it?
Thank you!
class WordCountProxy(proxy.Proxy):
def __init__(self, wordCounter): self.wordCounter = wordCounter proxy.Proxy.__init__(self)
def requestFactory(self, *args): return WordCountProxyRequest(self.wordCounter, *args)
class WordCountProxyFactory(http.HTTPFactory): def __init__(self, wordCounter): self.wordCounter = wordCounter http.HTTPFactory.__init__(self)
def buildProtocol(self, addr): protocol = WordCountProxy(self.wordCounter) return protocol
# classes for web reporting interface class WebReportRequest(http.Request): def __init__(self, wordCounter, *args): self.wordCounter = wordCounter http.Request.__init__(self, *args)
def process(self): self.setHeader("Content-Type", "text/html") words = self.wordCounter.words.items( ) words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1)) for word, count in words: self.write("<li>%s %s</li>" % (word, count)) self.finish( )
class WebReportChannel(http.HTTPChannel): def __init__(self, wordCounter): self.wordCounter = wordCounter http.HTTPChannel.__init__(self)
def requestFactory(self, *args): return WebReportRequest(self.wordCounter, *args)
class WebReportFactory(http.HTTPFactory): def __init__(self, wordCounter): self.wordCounter = wordCounter http.HTTPFactory.__init__(self)
def buildProtocol(self, addr): return WebReportChannel(self.wordCounter)
if __name__ == "__main__": from twisted.internet import reactor counter = WordCounter( ) prox = WordCountProxyFactory(counter) reactor.listenTCP(PROXY_PORT, prox) reactor.listenTCP(WEB_PORT, WebReportFactory(counter)) reactor.run( )
Jean-Paul
Twisted-web mailing list Twisted-web@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-web

On Tue, 16 Dec 2008 15:57:18 +0200, Radu Dragusin radudragusin@gmail.com wrote:
On Tue, Dec 16, 2008 at 2:20 PM, Jean-Paul Calderone exarkun@divmod.com wrote:
On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin radudragusin@gmail.com wrote:
[snip]
class WordCountProxyRequest(proxy.ProxyRequest): protocols = {'http': WordCountProxyClientFactory}
def __init__(self, wordCounter, *args): self.wordCounter = wordCounter proxy.ProxyRequest.__init__(self, *args)
- def process(self): proxy.ProxyRequest.process(self) print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
the print above prints:
received_headers: {'accept-language': 'en-us,en;q=0.5', 'accept-encoding': 'gzip,deflate', 'keep-alive': '300', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'user-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.4) Gecko/2008111318 Ubuntu/8.10 (intrepid) Firefox/3.0.4', 'accept-charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'host': 'www.google.com', 'cookie': 'PREF=ID=cfb3eb179de0c1e6:LD=en:NR=100:CR=2:TM=1228315308:LM=1229032156:GM=1:S=ImAuEufbnV6S7BAz; NID=17=lOVMiFLculcrfN-zUO7xxFTTUFzqQqaHOFHcG_BDmYFX8QKYbMoo7GrDoYH-8ASPBlVijG_Hstp7HSDQ_8WQexHPjwz6g_7ZVpBhwmh3vkKuO3jpf9dnzrnWthcW1mGh; S=photos_html=6ScUGfd699g4Xuuh0FeizA; TZ=-120', 'cache-control': 'max-age=0', 'proxy-connection': 'keep-alive'}
these are the values I want to modify, the 'accept-encoding', to be specific. How can I do it?
Request has a received_headers attribute which refers to a dictionary. You can delete the "accept-encoding" key from it. Make sure you do it before you call the base process method, though:
def process(self):
    """Drop the browser's Accept-Encoding header, then proxy the request.

    Without that header the request no longer advertises gzip/deflate
    support to the origin server.  The header must be removed before the
    base ``process`` runs.
    """
    # received_headers is a dict; pop() is a no-op when the browser
    # sent no Accept-Encoding header at all.
    self.received_headers.pop('accept-encoding', None)
    proxy.ProxyRequest.process(self)
Or, in Twisted 8.2, you can use a slightly better headers API:
def process(self):
    """Drop the browser's Accept-Encoding header, then proxy the request.

    Uses the ``requestHeaders`` API available from Twisted 8.2.  The header
    must be removed before the base ``process`` runs.
    """
    self.requestHeaders.removeHeader('accept-encoding')
    proxy.ProxyRequest.process(self)
Jean-Paul

I deleted the 'accept-encoding' key as you said and it worked. No more gzip.
Thread solved. Thank you!
Request has a received_headers attribute which refers to a dictionary. You can delete the "accept-encoding" key from it. Make sure you do it before you call the base process method, though:
def process(self):
    """Drop the browser's Accept-Encoding header, then proxy the request.

    Without that header the request no longer advertises gzip/deflate
    support to the origin server.  The header must be removed before the
    base ``process`` runs.
    """
    # received_headers is a dict; pop() is a no-op when the browser
    # sent no Accept-Encoding header at all.
    self.received_headers.pop('accept-encoding', None)
    proxy.ProxyRequest.process(self)
Or, in Twisted 8.2, you can use a slightly better headers API:
def process(self):
    """Drop the browser's Accept-Encoding header, then proxy the request.

    Uses the ``requestHeaders`` API available from Twisted 8.2.  The header
    must be removed before the base ``process`` runs.
    """
    self.requestHeaders.removeHeader('accept-encoding')
    proxy.ProxyRequest.process(self)
Jean-Paul
Twisted-web mailing list Twisted-web@twistedmatrix.com http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-web
participants (2)
-
Jean-Paul Calderone
-
Radu Dragusin