On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin <radudragusin@gmail.com> wrote:
I have an HTTP proxy made with twisted.web and want to change the request that the browser sends to the proxy so that the value of the 'accept-encoding' key is changed from 'gzip,deflate' to ' '.
I use the example from the Twisted Book:
By adding the overridden process method in WordCountProxyRequest I can get the request headers, but I have found no way to set a key/value pair. I want to make the server think that the browser does not support gzip, because twisted does not seem to support gzip: the response from www.google.com and many (but not all) sites still appears encoded. www.dpreview.com does not seem to gzip the response, and so the response is processed correctly.
What can I do to either correctly decode gzip responses or modify the 'accept-encoding' value to nothing so the server does not compress the response?
Thank you! *Example 4-8. wordcountproxy.py*
import re
import sgmllib
import sys

from twisted.python import log
from twisted.web import http, proxy

# Send Twisted's log output to standard output.
log.startLogging(sys.stdout)

WEB_PORT = 8000    # port the word-count report page listens on
PROXY_PORT = 8001  # port the HTTP proxy listens on
class WordParser(sgmllib.SGMLParser): def __init__(self): sgmllib.SGMLParser.__init__(self) self.chardata = [] self.inBody = False
def start_body(self, attrs): self.inBody = True
def end_body(self): self.inBody = False
def handle_data(self, data): if self.inBody: self.chardata.append(data)
def getWords(self): # extract words wordFinder = re.compile(r'\w*') words = wordFinder.findall("".join(self.chardata)) words = filter(lambda word: word.strip( ), words) print "WORDS ARE", words return words
class WordCounter(object):
    """Tally word occurrences case-insensitively, skipping common words."""

    # Words that are never counted.
    ignoredWords = "the a of in from to this that and or but is was be can could i you they we at".split()

    def __init__(self):
        # Maps each lower-cased word to the number of times it was added.
        self.words = {}

    def addWords(self, words):
        """Add every word in *words* to the tally.

        Each word is lower-cased first; words listed in ``ignoredWords``
        are left out.
        """
        for lowered in (word.lower() for word in words):
            if lowered in self.ignoredWords:
                continue
            self.words[lowered] = self.words.get(lowered, 0) + 1
class WordCountProxyClient(proxy.ProxyClient):
    """Proxy client that also runs HTML responses through a WordParser."""

    def handleHeader(self, key, value):
        """Pass the header on, and create a parser for text/html content."""
        proxy.ProxyClient.handleHeader(self, key, value)
        # [inline reply from the thread] How about skipping it here?
        isContentType = key.lower() == "content-type"
        if isContentType and value.split(';')[0] == 'text/html':
            # The other handlers test for the presence of this attribute
            # to decide whether the response should be parsed.
            self.parser = WordParser()

    def handleResponsePart(self, data):
        """Pass the data on, and feed it to the parser when there is one."""
        proxy.ProxyClient.handleResponsePart(self, data)
        if not hasattr(self, 'parser'):
            return
        self.parser.feed(data)

    def handleResponseEnd(self):
        """Finish the response, then record the parsed words, if any."""
        proxy.ProxyClient.handleResponseEnd(self)
        if not hasattr(self, 'parser'):
            return
        self.parser.close()
        self.father.wordCounter.addWords(self.parser.getWords())
        # Drop the parser so the hasattr() checks above fail from now on.
        del self.parser
class WordCountProxyClientFactory(proxy.ProxyClientFactory):
    """Client factory whose protocols are turned into WordCountProxyClient."""

    def buildProtocol(self, addr):
        """Build the stock client for *addr* and re-class it before returning."""
        protocol = proxy.ProxyClientFactory.buildProtocol(self, addr)
        # upgrade proxy.proxyClient object to WordCountProxyClient
        protocol.__class__ = WordCountProxyClient
        return protocol
class WordCountProxyRequest(proxy.ProxyRequest): protocols = {'http': WordCountProxyClientFactory}
def __init__(self, wordCounter, *args): self.wordCounter = wordCounter proxy.ProxyRequest.__init__(self, *args)
* def process(self): proxy.ProxyRequest.process(self) print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
class WordCountProxy(proxy.Proxy):
    """Proxy protocol that hands its word counter to every request it makes."""

    def __init__(self, wordCounter):
        self.wordCounter = wordCounter
        proxy.Proxy.__init__(self)

    def requestFactory(self, *args):
        """Return a WordCountProxyRequest bound to this proxy's counter."""
        request = WordCountProxyRequest(self.wordCounter, *args)
        return request
class WordCountProxyFactory(http.HTTPFactory):
    """Server factory that builds WordCountProxy protocols."""

    def __init__(self, wordCounter):
        self.wordCounter = wordCounter
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a new WordCountProxy sharing this factory's counter.

        *addr* is not used.
        """
        return WordCountProxy(self.wordCounter)
# classes for web reporting interface
class WebReportRequest(http.Request):
    """Request that answers with the current word counts as HTML list items."""

    def __init__(self, wordCounter, *args):
        self.wordCounter = wordCounter
        http.Request.__init__(self, *args)

    def process(self):
        """Write one ``<li>`` per counted word, highest count first."""
        self.setHeader("Content-Type", "text/html")
        # Key-based descending sort.  It is stable, so words with equal
        # counts keep the order the dictionary produced them in, exactly as
        # the previous cmp-based sort did, without needing a tuple-unpacking
        # lambda.
        ranked = sorted(self.wordCounter.words.items(),
                        key=lambda pair: pair[1], reverse=True)
        for word, count in ranked:
            self.write("<li>%s %s</li>" % (word, count))
        self.finish()
class WebReportChannel(http.HTTPChannel):
    """HTTP channel that hands its word counter to every request it makes."""

    def __init__(self, wordCounter):
        self.wordCounter = wordCounter
        http.HTTPChannel.__init__(self)

    def requestFactory(self, *args):
        """Return a WebReportRequest bound to this channel's counter."""
        request = WebReportRequest(self.wordCounter, *args)
        return request
class WebReportFactory(http.HTTPFactory):
    """Server factory that builds WebReportChannel protocols."""

    def __init__(self, wordCounter):
        self.wordCounter = wordCounter
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a new WebReportChannel sharing this factory's counter.

        *addr* is not used.
        """
        channel = WebReportChannel(self.wordCounter)
        return channel
if __name__ == "__main__":
    from twisted.internet import reactor

    # One counter is shared by the proxy and by the report page.
    counter = WordCounter()

    # The proxy records words on PROXY_PORT ...
    prox = WordCountProxyFactory(counter)
    reactor.listenTCP(PROXY_PORT, prox)

    # ... and the report page shows the counts on WEB_PORT.
    reactor.listenTCP(WEB_PORT, WebReportFactory(counter))

    reactor.run()
Jean-Paul