Mailman 3 How can I change the HTTP request to avoid gzip - Twisted-web

16 Dec 2008

      I have an HTTP proxy made with twisted.web and want to change the request
that the browser sends to the proxy so that the value of the
'accept-encoding' header is changed from 'gzip,deflate' to ' '.

I use the example from the Twisted book:

By adding the overridden process method in WordCountProxyRequest I can get
the request headers, but I have found no way to set a key/value pair.
I want to make the server think that the browser does not support gzip, because
twisted does not seem to support gzip: the responses from www.google.com and
many (but not all) other sites still appear encoded. www.dpreview.com seems not
to gzip the response, and so the response is processed correctly.

What can I do to either correctly decode gzip responses or modify the
'accept-encoding' value to nothing so the server does not compress the
response?

Thank you!
*Example 4-8. wordcountproxy.py*

import sgmllib, re
from twisted.web import proxy, http
import sys
from twisted.python import log
# Start Twisted logging with stdout as the log destination.
log.startLogging(sys.stdout)

# Port the word-count report (WebReportFactory) listens on; see __main__.
WEB_PORT = 8000
# Port the counting proxy (WordCountProxyFactory) listens on; see __main__.
PROXY_PORT = 8001

class WordParser(sgmllib.SGMLParser):
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.chardata = []
        self.inBody = False

    def start_body(self, attrs):
        self.inBody = True

    def end_body(self):
        self.inBody = False

    def handle_data(self, data):
        if self.inBody:
            self.chardata.append(data)

    def getWords(self):
        # extract words
        wordFinder = re.compile(r'\w*')
        words = wordFinder.findall("".join(self.chardata))
        words = filter(lambda word: word.strip( ), words)
        print "WORDS ARE", words
        return words

class WordCounter(object):
    """Accumulate case-insensitive word frequencies, skipping common words.

    Attributes:
        ignoredWords: list of lower-case words that are never counted.
        words: dict mapping each counted (lower-cased) word to its count.
    """

    # Implicit string concatenation keeps the literal valid across lines;
    # note the trailing space that separates "be" from "can".
    ignoredWords = (
        "the a of in from to this that and or but is was be "
        "can could i you they we at"
    ).split()

    def __init__(self):
        self.words = {}

    def addWords(self, words):
        """Add every word in *words* to the tally unless it is ignored.

        Words are lower-cased before the ignore check and before counting.
        """
        for word in words:
            word = word.lower()
            if word not in self.ignoredWords:
                self.words[word] = self.words.get(word, 0) + 1

class WordCountProxyClient(proxy.ProxyClient):
    """ProxyClient that also feeds text/html response bodies to a WordParser.

    Every hook delegates to proxy.ProxyClient first and then does its own
    word-counting work.
    """

    def handleHeader(self, key, value):
        """Forward the header; create a parser when the content is text/html."""
        proxy.ProxyClient.handleHeader(self, key, value)
        # Only the part before the first ';' is compared, so parameters that
        # follow the media type do not matter.
        isHtml = (key.lower() == "content-type"
                  and value.split(';')[0] == 'text/html')
        if isHtml:
            self.parser = WordParser()

    def handleResponsePart(self, data):
        """Forward the chunk; also feed it to the parser if one exists."""
        proxy.ProxyClient.handleResponsePart(self, data)
        if not hasattr(self, 'parser'):
            return
        self.parser.feed(data)

    def handleResponseEnd(self):
        """Forward the end of the response, then hand the words to the counter.

        The parser attribute is deleted afterwards, so a further call finds
        no parser and only delegates.
        """
        proxy.ProxyClient.handleResponseEnd(self)
        if not hasattr(self, 'parser'):
            return
        parser = self.parser
        parser.close()
        # NOTE(review): self.father is presumably the originating
        # WordCountProxyRequest, which holds the shared wordCounter.
        self.father.wordCounter.addWords(parser.getWords())
        del self.parser

class WordCountProxyClientFactory(proxy.ProxyClientFactory):
    """ProxyClientFactory whose protocols behave as WordCountProxyClient."""

    def buildProtocol(self, addr):
        """Build the stock protocol, then re-class it as WordCountProxyClient."""
        protocol = proxy.ProxyClientFactory.buildProtocol(self, addr)
        # The base factory constructs the object; swapping __class__ gives the
        # existing instance the WordCountProxyClient hooks.
        protocol.__class__ = WordCountProxyClient
        return protocol

class WordCountProxyRequest(proxy.ProxyRequest):
    protocols = {'http': WordCountProxyClientFactory}

    def __init__(self, wordCounter, *args):
        self.wordCounter = wordCounter
        proxy.ProxyRequest.__init__(self, *args)

*    def process(self):
        proxy.ProxyRequest.process(self)
        print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*

class WordCountProxy(proxy.Proxy):
    """Proxy protocol that creates WordCountProxyRequest objects."""

    def __init__(self, wordCounter):
        # Set before the base initialiser runs, as requestFactory reads it.
        self.wordCounter = wordCounter
        proxy.Proxy.__init__(self)

    def requestFactory(self, *requestArgs):
        """Return a request bound to this proxy's word counter."""
        counter = self.wordCounter
        return WordCountProxyRequest(counter, *requestArgs)

class WordCountProxyFactory(http.HTTPFactory):
    """HTTP factory producing WordCountProxy protocols with a shared counter."""

    def __init__(self, wordCounter):
        self.wordCounter = wordCounter
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a new WordCountProxy; *addr* is not used."""
        return WordCountProxy(self.wordCounter)

# classes for web reporting interface
class WebReportRequest(http.Request):
    """Request that answers with the current word counts as HTML list items."""

    def __init__(self, wordCounter, *args):
        self.wordCounter = wordCounter
        http.Request.__init__(self, *args)

    def process(self):
        """Write one <li> per counted word, highest count first, then finish."""
        self.setHeader("Content-Type", "text/html")
        # Stable descending sort by count: words with equal counts keep the
        # order in which items() returned them.
        ranked = sorted(self.wordCounter.words.items(),
                        key=lambda entry: entry[1], reverse=True)
        for word, count in ranked:
            self.write("<li>%s %s</li>" % (word, count))
        self.finish()

class WebReportChannel(http.HTTPChannel):
    """HTTP channel that creates WebReportRequest objects."""

    def __init__(self, wordCounter):
        # Set before the base initialiser runs, as requestFactory reads it.
        self.wordCounter = wordCounter
        http.HTTPChannel.__init__(self)

    def requestFactory(self, *requestArgs):
        """Return a report request bound to this channel's word counter."""
        counter = self.wordCounter
        return WebReportRequest(counter, *requestArgs)

class WebReportFactory(http.HTTPFactory):
    """HTTP factory producing WebReportChannel protocols with a shared counter."""

    def __init__(self, wordCounter):
        self.wordCounter = wordCounter
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a new WebReportChannel; *addr* is not used."""
        channel = WebReportChannel(self.wordCounter)
        return channel

if __name__ == "__main__":
    from twisted.internet import reactor

    # One counter is shared: the proxy fills it and the report reads it.
    sharedCounter = WordCounter()
    proxyFactory = WordCountProxyFactory(sharedCounter)
    reportFactory = WebReportFactory(sharedCounter)

    reactor.listenTCP(PROXY_PORT, proxyFactory)
    reactor.listenTCP(WEB_PORT, reportFactory)
    reactor.run()

-- 
Radu

How can I change the HTTP request to avoid gzip

Radu Dragusin

Jean-Paul Calderone

Radu Dragusin

Jean-Paul Calderone

Radu Dragusin

tags

participants (2)