On Tue, Dec 16, 2008 at 2:20 PM, Jean-Paul Calderone <exarkun@divmod.com> wrote:
On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin <radudragusin@gmail.com> wrote:
I have an HTTP proxy made with twisted.web and want to change the request
that the browser sends to the proxy so that the value of the
'accept-encoding' key is changed from 'gzip,deflate' to ' '.

I use the example from the Twisted book:

By adding the overridden process method in WordCountProxyRequest I can get
the request headers but have found no way to set a key, value pair.
I want to make the server think that the browser does not support gzip because
twisted seems to not support gzip, as the response from www.google.com and
many (but not all) sites still appears encoded. www.dpreview.com seems not
to gzip the response, and so the response is processed correctly.

What can I do to either correctly decode gzip responses or modify the
'accept-encoding' value to nothing so the server does not compress the
response?

Thank you!
*Example 4-8. wordcountproxy.py*

import sgmllib, re
from twisted.web import proxy, http
import sys
from twisted.python import log
log.startLogging(sys.stdout)

# Port on which the word-count report server (WebReportFactory) listens.
WEB_PORT = 8000
# Port on which the HTTP proxy (WordCountProxyFactory) listens.
PROXY_PORT = 8001

class WordParser(sgmllib.SGMLParser):
  def __init__(self):
      sgmllib.SGMLParser.__init__(self)
      self.chardata = []
      self.inBody = False

  def start_body(self, attrs):
      self.inBody = True

  def end_body(self):
      self.inBody = False

  def handle_data(self, data):
      if self.inBody:
          self.chardata.append(data)

  def getWords(self):
      # extract words
      wordFinder = re.compile(r'\w*')
      words = wordFinder.findall("".join(self.chardata))
      words = filter(lambda word: word.strip( ), words)
      print "WORDS ARE", words
      return words

class WordCounter(object):
  """Tally how often each non-trivial word has been seen."""

  # Common words that are never counted.  The literal is split with
  # implicit string concatenation so that no line break falls inside a
  # string (a wrapped literal is a syntax error).
  ignoredWords = ("the a of in from to this that and or but is was be "
                  "can could i you they we at").split( )

  def __init__(self):
      # Maps lower-cased word -> number of occurrences.
      self.words = {}

  def addWords(self, words):
      """Add every word in ``words`` to the tally, ignoring case.

      Words listed in ``ignoredWords`` are skipped.
      """
      for word in words:
          word = word.lower( )
          if word not in self.ignoredWords:
              self.words[word] = self.words.get(word, 0) + 1

class WordCountProxyClient(proxy.ProxyClient):
  # Proxy client that delegates each step to proxy.ProxyClient and, for
  # text/html responses, also feeds the body to a WordParser.
  def handleHeader(self, key, value):
      # Let the base class handle the header first; the content-type check
      # that follows decides whether a WordParser is created.
      proxy.ProxyClient.handleHeader(self, key, value)

How about skipping it here?
 
 
If I use here the following:

print "[", key, ":", value,"]"

I get:
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Cache-Control : no-cache, no-store, max-age=0, must-revalidate ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Pragma : no-cache ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Expires : Fri, 01 Jan 1990 00:00:00 GMT ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec 2008 13:37:21 GMT ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Type : text/javascript; charset=UTF-8 ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie : GMAIL_STAT_3492=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/dragusin.ro ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie : GMAIL_IMP=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/dragusin.ro ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Encoding : gzip ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ X-Content-Type-Options : nosniff ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Length : 14340 ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Server : GFE/1.3 ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Connection : Close ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Cache-Control : private, max-age=0 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec 2008 13:37:21 GMT ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Expires : -1 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Type : text/html; charset=UTF-8 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Encoding : gzip ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Server : gws ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Length : 2597 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Connection : Close ]

So those are the response headers.
I need to override the request headers, the ones that the browser sends to the proxy server.
See below:
 


      # Only text/html responses get a WordParser; anything after ';'
      # (such as the charset) is ignored for the comparison.
      if key.lower( ) == "content-type":
          if value.split(';')[0] == 'text/html':
              self.parser = WordParser( )

  def handleResponsePart(self, data):
      proxy.ProxyClient.handleResponsePart(self, data)
      if hasattr(self, 'parser'): self.parser.feed(data)


  def handleResponseEnd(self):
      proxy.ProxyClient.handleResponseEnd(self)
      if hasattr(self, 'parser'):
          self.parser.close( )
          self.father.wordCounter.addWords(self.parser.getWords( ))
          del(self.parser)

class WordCountProxyClientFactory(proxy.ProxyClientFactory):
  """Client factory whose protocols are turned into WordCountProxyClient."""

  def buildProtocol(self, addr):
      """Build the base factory's protocol and re-class it in place."""
      protocol = proxy.ProxyClientFactory.buildProtocol(self, addr)
      # Swap the instance's class so the WordCountProxyClient overrides apply.
      protocol.__class__ = WordCountProxyClient
      return protocol

class WordCountProxyRequest(proxy.ProxyRequest):
  protocols = {'http': WordCountProxyClientFactory}

  def __init__(self, wordCounter, *args):
      self.wordCounter = wordCounter
      proxy.ProxyRequest.__init__(self, *args)

*    def process(self):
      proxy.ProxyRequest.process(self)
      print "received_headers", proxy.ProxyRequest.getAllHeaders(self)* 

the print above prints:
 
received_headers: {'accept-language': 'en-us,en;q=0.5', 'accept-encoding': 'gzip,deflate', 'keep-alive': '300', 'accept': 'text/html,application/xhtml+
xml,application/xml;q=0.9,*/*;q=0.8', 'user-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.4) Gecko/2008111318 Ubuntu/8.10 (intrepid) Firefox/3.0.4', 'accept-charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'host': 'www.google.com', 'cookie': 'PREF=ID=cfb3eb179de0c1e6:LD=en:NR=100:CR=2:TM=1228315308:LM=1229032156:GM=1:S=ImAuEufbnV6S7BAz; NID=17=lOVMiFLculcrfN-zUO7xxFTTUFzqQqaHOFHcG_BDmYFX8QKYbMoo7GrDoYH-8ASPBlVijG_Hstp7HSDQ_8WQexHPjwz6g_7ZVpBhwmh3vkKuO3jpf9dnzrnWthcW1mGh; S=photos_html=6ScUGfd699g4Xuuh0FeizA; TZ=-120', 'cache-control': 'max-age=0', 'proxy-connection': 'keep-alive'}

These are the values I want to modify -- the 'accept-encoding' value, to be specific. How can I do it?

Thank you!

 
class WordCountProxy(proxy.Proxy):
  """Proxy protocol that creates WordCountProxyRequest objects."""

  def __init__(self, wordCounter):
      # Counter handed to every request this protocol creates.
      self.wordCounter = wordCounter
      proxy.Proxy.__init__(self)

  def requestFactory(self, *args):
      """Create a request bound to this protocol's word counter."""
      request = WordCountProxyRequest(self.wordCounter, *args)
      return request

class WordCountProxyFactory(http.HTTPFactory):
  """Server factory producing WordCountProxy protocols that share one counter."""

  def __init__(self, wordCounter):
      # Counter passed on to every protocol built by this factory.
      self.wordCounter = wordCounter
      http.HTTPFactory.__init__(self)

  def buildProtocol(self, addr):
      """Return a new WordCountProxy bound to this factory's word counter."""
      return WordCountProxy(self.wordCounter)

# classes for web reporting interface
class WebReportRequest(http.Request):
  """Request that answers with the current word counts as HTML list items."""

  def __init__(self, wordCounter, *args):
      # Counter whose tallies are reported by process().
      self.wordCounter = wordCounter
      http.Request.__init__(self, *args)

  def process(self):
      """Write one <li> per counted word, most frequent first, then finish."""
      self.setHeader("Content-Type", "text/html")
      # Highest count first; the sort is stable, so words with equal counts
      # keep the order in which items() returned them.
      ranked = sorted(self.wordCounter.words.items( ),
                      key=lambda pair: pair[1], reverse=True)
      for word, count in ranked:
          self.write("<li>%s %s</li>" % (word, count))
      self.finish( )

class WebReportChannel(http.HTTPChannel):
  """HTTP channel that creates WebReportRequest objects."""

  def __init__(self, wordCounter):
      # Counter handed to every request this channel creates.
      self.wordCounter = wordCounter
      http.HTTPChannel.__init__(self)

  def requestFactory(self, *args):
      """Create a report request bound to this channel's word counter."""
      request = WebReportRequest(self.wordCounter, *args)
      return request

class WebReportFactory(http.HTTPFactory):
  """Server factory producing WebReportChannel protocols for one counter."""

  def __init__(self, wordCounter):
      # Counter passed on to every channel built by this factory.
      self.wordCounter = wordCounter
      http.HTTPFactory.__init__(self)

  def buildProtocol(self, addr):
      """Return a new WebReportChannel bound to this factory's word counter."""
      channel = WebReportChannel(self.wordCounter)
      return channel

if __name__ == "__main__":
  from twisted.internet import reactor
  # One counter is shared by the proxy and by the report server.
  counter = WordCounter( )
  # Proxy listener.
  prox = WordCountProxyFactory(counter)
  reactor.listenTCP(PROXY_PORT, prox)
  # Report listener.
  report = WebReportFactory(counter)
  reactor.listenTCP(WEB_PORT, report)
  reactor.run( )



Jean-Paul

_______________________________________________
Twisted-web mailing list
Twisted-web@twistedmatrix.com
http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-web



--
Radu