Attempting to parse free-form ANSI text.

Mon Oct 23 08:56:35 EDT 2006

Paul McGuire wrote:
> "Michael B. Trausch" <"mike$#at^&nospam!%trauschus"> wrote in message 
> news:GsGdnTIYc-lXaafYnZ2dnUVZ_sCdnZ2d at comcast.com...
>   
>> Alright... I am attempting to find a way to parse ANSI text from a
>> telnet application.  However, I am experiencing a bit of trouble.
>>
>> What I want to do is have all ANSI sequences _removed_ from the output,
>> save for those that manage color codes or text presentation (in short,
>> the ones that are ESC[#m (with additional #s separated by ; characters).
>> The ones that are left, the ones that are the color codes, I want to
>> act on, and remove from the text stream, and display the text.
>>
>>     
> Here is a pyparsing-based scanner/converter, along with some test code at 
> the end.  It takes care of partial escape sequences, and strips any 
> sequences of the form
> "<ESC>[##;##;...<alpha>", unless the trailing alpha is 'm'.
> The pyparsing project wiki is at http://pyparsing.wikispaces.com.
>
> -- Paul
>
> from pyparsing import *
>
>   
snip
>  
>
> test = """\
> This is a test string containing some ANSI sequences.
> Sequence 1: ~[10;12m
> Sequence 2: ~[3;4h
> Sequence 3: ~[4;5m
> Sequence 4; ~[m
> Sequence 5; ~[24HNo more escape sequences.
> ~[7""".replace('~',chr(27))
>
> leftOver = processInputString(test)
>
>
> Prints:
> This is a test string containing some ANSI sequences.
> Sequence 1:
> <change color attributes to ['1012']>
>   
I doubt we should concatenate numbers.
> Sequence 2:
>
> Sequence 3:
> <change color attributes to ['45']>
>
> Sequence 4;
> <change color attributes to ['']>
>
> Sequence 5;
> No more escape sequences.
>
> <found partial escape sequence ['\x1b[7'], tack it on front of next>
>
>
>   
Another one of Paul's elegant pyparsing solutions. To satisfy my own 
curiosity, I tried to see how SE stacked up and devoted more time than I 
really should to finding out. In the end I don't know if it was worth 
the effort, but having made it I might as well just throw it in.

The following code does everything Mike needs to do, except interact 
with wx. It is written to run stand-alone. To incorporate it in 
Mike's class, the functions would become methods and the globals would 
become instance attributes. Running it does this:

 >>> chunk_1 = """This is a test string containing some ANSI sequences.
Sequence 1 Valid code, invalid numbers: \x1b[10;12mEnd of sequence 1
Sequence 2 Not an 'm'-code: \x1b[30;4;77hEnd of sequence 2
Sequence 3 Color setting code: \x1b[30;45mEnd of sequence 3
Sequence 4 Parameter setting code: \x1b[7mEnd of sequence 4
Sequence 5 Color setting code spanning calls: \x1b[3"""

 >>> chunk_2 = """7;42mEnd of sequence 5
Sequence 6 Invalid code: \x1b[End of sequence 6
Sequence 7 A valid code at the end: \x1b[9m
"""

 >>> init ()
 >>> process_text (chunk_1)
 >>> process_text (chunk_2)
 >>> print output

This is a test string containing some ANSI sequences.
Sequence 1 Valid code, invalid numbers:  >>! Ignoring unknown number 10 
!<<  >>! Ignoring unknown number 12 !<< End of sequence 1
Sequence 2 Not an 'm'-code: End of sequence 2
Sequence 3 Color setting code:  >>setting foreground BLACK<<  >>setting 
background MAGENTA<< End of sequence 3
Sequence 4 Parameter setting code:  >>Calling parameter setting function 
7<< End of sequence 4
Sequence 5 Color setting code spanning calls:  >>setting foreground 
GREY<<  >>setting background GREEN<< End of sequence 5
Sequence 6 Invalid code: nd of sequence 6
Sequence 7 A valid code at the end:  >>Calling parameter setting 
function 9<<

###################

And here it goes:

def init():
   """Create the module-level state used by process_text() and _set_wx().

   Sets these globals (instance attributes when folded into
   AnsiTextCtrl.__init__()):

   Pre_Processor          SE translator built from the definitions below; it
                          is meant to delete every ANSI escape sequence
                          except the colour ('m') sequences.
   digits_re              compiled regular expression matching a run of
                          digits.
   Colors                 SE translator mapping colour numbers (30-37, 39,
                          40-47, 49) to colour names.
   truncated_escape_hold  partial escape sequence carried over between
                          calls of process_text(); starts out empty.
   output                 accumulated result text (testing only).
   """

   # To add to AnsiTextCtrl.__init__ ()

   # SEL is less import overhead but doesn't have interactive development
   # features (not needed in production versions).
   import SE
   # Imported here because 're' is needed for digits_re below and this
   # snippet shows no module-level import of it.
   import re

   global output  #-> For testing
   global Pre_Processor, digits_re, Colors, truncated_escape_hold   # global -> instance attributes

   # Screening out all ansi escape sequences except those controlling color.
   # Regular expression r'[\x80-\xff\r]' would work fine but is four times
   # slower than 127 fixed definitions.
   # NOTE(review): range (128, 255) stops at 254, so byte 255 is not covered
   # although the regular expression quoted above includes \xff -- confirm
   # whether that is intended.
   grit = '\n'.join(['(%d)=' % i for i in range(128, 255)]) + ' (13)= '
   all_escapes   = r'\x1b\[\d*(;\d*)*[A-Za-z]'
   color_escapes = r'\x1b\[\d*(;\d*)*m'
   # 'all_escapes' also matches what 'color_escapes' matches. With identical
   # regular expression matches it is the last definition that applies.
   # Other than that, the order of definitions is irrelevant to precedence.
   Pre_Processor = SE.SE('%s ~%s~= ~%s~==' % (grit, all_escapes, color_escapes))  # SEL.SEL for production

   # Isolating digits. Raw string so that '\d' reaches the regex engine
   # as written instead of being treated as a string escape.
   digits_re = re.compile(r'\d+')

   # Color numbers to color names
   Colors = SE.SE('''
       30=BLACK    40=BLACK
       31=RED      41=RED
       32=GREEN    42=GREEN
       33=YELLOW   43=YELLOW
       34=BLUE     44=BLUE
       35=MAGENTA  45=MAGENTA
       36=CYAN     46=CYAN
       37=GREY     47=GREY
       39=GREY     49=BLACK
       <EAT>
   ''')

   truncated_escape_hold = ''  #-> self.truncated_escape_hold
   output                = ''  #-> For testing only

# What follows replaces all others of Mike's methods

def process_text(text):
   """Process one chunk of telnet text, acting on its ANSI colour codes.

   The chunk is run through Pre_Processor, which is set up in init() to
   remove every escape sequence except the colour ('m') sequences. The
   result is split on ESC; the numbers of each colour sequence are handed
   to _set_wx() and the text following the sequence is appended to the
   global 'output' (testing stand-in for self.AppendText).

   An escape sequence cut off at the end of the chunk (no terminating 'm')
   is kept in the global 'truncated_escape_hold' and completed by the next
   call.

   text -- the next chunk of raw text received.

   Returns None; results are accumulated in the global 'output'.
   init() must have been called first.
   """

   global output  #-> For testing
   global truncated_escape_hold, digits_re, Pre_Processor, Colors

   # Join the fragment held over from the previous call to the new text
   # *before* screening, so that a sequence split across two chunks is
   # screened exactly like an unsplit one (a split non-color sequence is
   # removed instead of leaking through). Then clear the hold so the same
   # fragment is not prepended again on every later call.
   purged_text = Pre_Processor(truncated_escape_hold + text)
   truncated_escape_hold = ''
   # Text is now clean except for color codes beginning with ESC

   ansi_controlled_sections = purged_text.split('\x1b')
   # Each ansi_controlled_section starts with a color control, except the
   # first one (leftmost split-off). str.split always returns at least one
   # element, so index 0 is safe.

   #-> self.AppendText(ansi_controlled_sections [0])   #-> For real
   output += ansi_controlled_sections[0]               #-> For testing

   for section in ansi_controlled_sections[1:]:
      if section == '':
         continue
      try:
         escape_ansi_controlled_section, data = section.split('m', 1)
      except ValueError:   # No terminating 'm': truncated escape
         # Restore ESC removed by split ('\x1b')
         truncated_escape_hold = '\x1b' + section
         continue

      for escape in escape_ansi_controlled_section.split(';'):
         found = digits_re.search(escape)
         if found is None:
            output += ' >>!!!Invalid number %s!!!<<< ' % escape   #-> For testing
            continue
         _set_wx(found.group())

      #-> self.AppendText(data)     #-> For real
      output += data                #-> For testing

def _set_wx(n):
   """Act on a single number taken from an ANSI 'm' escape sequence.

   n -- the number as a string of digits (e.g. '7', '31', '45').

   0..9    reported as a parameter setting call.
   30..39  reported as a foreground colour, if Colors knows the number.
   40..49  reported as a background colour, if Colors knows the number.
   Anything else is reported as an ignored unknown number.

   In this stand-alone version every action is only recorded by appending
   a marker to the global 'output'; the '#-> For real' comments show the
   intended wx counterpart. Returns None.
   """

   global output  # For testing only
   global Colors

   int_n = int(n)

   if 0 <= int_n <= 9:
      #-> self._number_to_method (n)()   #-> For real
      # NOTE(review): _number_to_method is shown as a dict of plain
      # functions, so the real call presumably has to be
      # self._number_to_method[n](self), and '5' has no entry -- confirm.
      output += ' >>Calling parameter setting function %s<< ' % n   #-> For testing
      return

   # Only numbers in the color range are looked up; everything else falls
   # through to the "unknown number" report without consulting Colors.
   if 30 <= int_n < 50:
      color = Colors(n)
      if color:
         if 40 <= int_n:
            #-> self.AnsiBGColor = color                      #-> For real
            output += ' >>setting background %s<< ' % color   #-> For testing
         else:
            #-> self.AnsiFGColor = color                      #-> For real
            output += ' >>setting foreground %s<< ' % color   #-> For testing
         return

   output += ' >>!!!Ignoring unknown number %s!!!<< ' % n   #-> For testing

#-> For real requires this in addition:
#->
#->   # Methods controlled by 'm' code 0 to 9:  # Presumably 'm'?
#->
#->   def _0 (self):
#->      self.AnsiFGColor = 'GREY'
#->      self.AnsiBGColor = 'BLACK'
#->      self.AnsiFontSize = 9
#->      self.AnsiFontFamily = wx.FONTFAMILY_TELETYPE
#->      self.AnsiFontStyle = wx.FONTSTYLE_NORMAL
#->      self.AnsiFontWeight = wx.FONTWEIGHT_NORMAL
#->      self.AnsiFontUnderline = False
#->
#->   def  _1 (self): self.AnsiFontWeight = wx.FONTWEIGHT_BOLD
#->   def  _2 (self): self.AnsiFontWeight = wx.FONTWEIGHT_LIGHT
#->   def  _3 (self): self.AnsiFontStyle = wx.FONTSTYLE_ITALIC
#->   def  _4 (self): self.AnsiFontUnderline = True
#->   def  _5 (self): pass
#->   def  _7 (self): self.AnsiFGColor, self.AnsiBGColor = self.AnsiBGColor, self.AnsiFGColor
#->   def  _8 (self): self.AnsiFGColor = self.AnsiBGColor
#->   def  _9 (self): pass
#->
#->
#->   _number_to_method = {
#->      '0' : _0,
#->      '1' : _1,
#->      '2' : _2,
#->      '3' : _3,
#->      '4' : _4,
#->      '7' : _7,
#->      '8' : _8,
#->      '9' : _9,
#->   }

################

The most recent version of SE is now 2.3 with a rare malfunction 
corrected. (SE from http://cheeseshop.python.org/pypi/SE/2.2%20beta)