[Pythonmac-SIG] Re: [HELP!] PC Line Endings? Batch file replacement

Mon, 11 Mar 2002 09:30:04 -0800

This is a multi-part message in MIME format.
--------------E6C62C8F6E2E246705BDC51B
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Dean,

your utility looks great.

My only suggestion is something you've already talked about: converting
spaces->tabs and tabs->spaces while preserving Python semantics.  Since
this utility is designed to work with Python files, there is absolutely
no point in doing it any other way.

The code to do it right is not as trivial as I thought when I first
started to write it, but then I found a converter on the net that I
believe was written by Gordon McMillan. He used the tokenize module to
do the heavy lifting, and this guarantees that it is done the way the
Python interpreter would do it. I've enclosed it with this email. You
should be able to do a quick cut & paste of some of this code into your
utility, and have what you want.

Note that this code allows conversion to mixed tabs and spaces...please
don't allow that option in your version!

-Chris

-- 
Christopher Barker, Ph.D.
Oceanographer

NOAA/OR&R/HAZMAT         (206) 526-6959   voice
7600 Sand Point Way NE   (206) 526-6329   fax
Seattle, WA  98115       (206) 526-6317   main reception

Chris.Barker@noaa.gov
--------------E6C62C8F6E2E246705BDC51B
Content-Type: text/plain; charset=us-ascii;
 name="tabcleaner.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="tabcleaner.py"

#!/usr/bin/python

import getopt
import os
import string
import sys
import tokenize

# Output indentation styles understood by PyText.process().
TABSONLY = 'TABSONLY'      # one tab per indentation level
SPACESONLY = 'SPACESONLY'  # `indent` spaces per indentation level
MIXED = 'MIXED'            # `indent` columns per level, written as tabs then spaces

class PyText:
	"""Rewrite the leading whitespace of one Python source file.

	Indentation levels are taken from the INDENT/DEDENT tokens produced by
	the ``tokenize`` module instead of being guessed from column counts.
	Every run of lines that shares one level is re-emitted with a fresh
	prefix for that level; indentation beyond that of the run's first line
	(continuation lines, for instance) is kept, expressed as spaces.

	``optdict`` may contain:
	  'style'  -- TABSONLY (default), SPACESONLY or MIXED
	  'indent' -- columns per level for SPACESONLY/MIXED (default '4')
	  'tabs'   -- columns per tab for MIXED output (default '8')
	Numeric options may be given as strings or ints.

	The code sticks to the common subset of Python 2.6+ and Python 3.
	"""
	def __init__(self, fnm, optdict):
		"""Read all lines of file ``fnm``; ``optdict`` holds the options."""
		self.optdict = optdict
		self.fnm = fnm
		# Close the handle explicitly instead of relying on garbage collection.
		with open(self.fnm, 'r') as src:
			self.txt = src.readlines()
		# Segments of equal indentation: (level, first_line) while still
		# open, (level, first_line, end_line) once closed.  Line numbers are
		# 0-based indexes into self.txt and end_line is exclusive.
		self.indents = [(0, 0, )]
		self.lnndx = 0
		self.indentndx = 0
		# 0-based indexes of lines that continue a multi-line token (the
		# inside of a triple-quoted string); their whitespace is data.
		self.verbatim = set()
	def getline(self):
		"""Readline-style callback for tokenize: next line, or '' at the end."""
		if self.lnndx < len(self.txt):
			txt = self.txt[self.lnndx]
			self.lnndx = self.lnndx + 1
		else:
			txt = ''
		return txt
	def tokeneater(self, type, token, start, end, line):
		"""Record segment boundaries and multi-line tokens for one token.

		``start`` and ``end`` are (row, col) pairs with 1-based rows, as
		produced by the tokenize module.
		"""
		if end[0] > start[0]:
			# Rows start[0]+1 .. end[0] (1-based) begin inside this token.
			self.verbatim.update(range(start[0], end[0]))
		if type == tokenize.INDENT:
			step = 1
		elif type == tokenize.DEDENT:
			step = -1
		elif type == tokenize.ENDMARKER:
			# The last open segment runs to the end of the file.
			lvl, s = self.indents[-1][:2]
			self.indents[-1] = (lvl, s, len(self.txt))
			return
		else:
			return
		row = start[0] - 1
		lvl, s = self.indents[-1][:2]
		# Close the current segment at this row and open the next one.
		self.indents[-1] = (lvl, s, row)
		self.indents.append((lvl + step, row,))
	def split(self, ln):
		"""Split a physical line into ``(lead, content)``.

		``lead`` is the leading whitespace with tabs expanded to 8-column
		tab stops and ``content`` is the rest of the line including its
		terminator.  A whitespace-only line yields ``('', eol)`` where
		``eol`` is the line's own terminator, or a bare newline when the
		line has none.
		"""
		content = ln.lstrip()
		if not content:
			eol = ln[len(ln.rstrip('\r\n')):]
			return ('', eol or '\n')
		lead = ln[:len(ln) - len(content)]
		return (lead.expandtabs(), content)

	def process(self):
		"""Tokenize the file, rebuild every line's indentation and save.

		Lines that lie inside a multi-line token are copied unchanged and
		whitespace-only lines are written without any indentation.  Errors
		raised by the tokenizer propagate before anything is written.
		"""
		style = self.optdict.get('style', TABSONLY)
		indent = int(self.optdict.get('indent', '4'))
		tabsz = int(self.optdict.get('tabs', '8'))
		print('file %s -> style %s, tabsize %d, indent %d' % (self.fnm, style, tabsz, indent))
		# generate_tokens() takes a readline callable on Python 2 and 3 alike.
		for tok in tokenize.generate_tokens(self.getline):
			self.tokeneater(*tok)
		new = []
		for (lvl, s, e) in self.indents:
			if s >= len(self.txt):
				break
			if s == e:
				continue
			if style == TABSONLY:
				newlead = '\t'*lvl
			elif style == SPACESONLY:
				newlead = ' '*(indent*lvl)
			else:
				# MIXED: as many whole tabs as fit, then spaces.
				t, spcs = divmod(indent*lvl, tabsz)
				newlead = '\t'*t + ' '*spcs
			oldlead, content = self.split(self.txt[s])
			new.append(newlead + content)
			for ndx in range(s + 1, e):
				ln = self.txt[ndx]
				if ndx in self.verbatim:
					# Inside a string literal: changing it would change the program.
					new.append(ln)
					continue
				lead, content = self.split(ln)
				if not ln.strip():
					# Do not manufacture trailing whitespace on blank lines.
					new.append(content)
				else:
					# Keep indentation beyond the segment's first line.
					new.append(newlead + lead[len(oldlead):] + content)
		self.save(new)

	def save(self, txt):
		"""Move the original file to ``<name>.bak`` and write ``txt`` in its place.

		``txt`` is a list of lines that already carry their terminators.
		"""
		bakname = os.path.splitext(self.fnm)[0]+'.bak'
		print("backing up %s to %s" % (self.fnm, bakname))
		try:
			os.rename(self.fnm, bakname)
		except OSError:
			if not os.path.exists(bakname):
				# The failure was not caused by a stale backup; report it.
				raise
			# Some platforms refuse to rename over an existing file.
			os.remove(bakname)
			os.rename(self.fnm, bakname)
		with open(self.fnm, 'w') as dst:
			dst.writelines(txt)

def test():
	"""Run the cleaner over ``test1.py`` once per option set.

	``test1.py`` must exist in the current directory.  Every run rewrites
	it in place and replaces ``test1.bak`` with the previous contents.
	"""
	# PyText takes its options as a dict passed to the constructor, and
	# process() accepts no arguments.  Numeric options are given as strings,
	# the form the command line delivers them in.
	option_sets = (
		{},
		{'style': TABSONLY},
		{'style': MIXED, 'indent': '4', 'tabs': '8'},
		{'style': MIXED, 'indent': '2', 'tabs': '8'},
	)
	for optdict in option_sets:
		tc = PyText('test1.py', optdict)
		tc.process()

def cleanfile(fnm, d):
	"""Clean ``fnm`` with the options in ``d``.

	A real (non-symlink) directory is walked: real sub-directories and
	entries whose last three characters normalise to ".py" are handled
	recursively.  Anything else is handed to PyText as a single file.
	"""
	walkable = os.path.isdir(fnm) and not os.path.islink(fnm)
	if not walkable:
		PyText(fnm, d).process()
		return
	for entry in os.listdir(fnm):
		child = os.path.join(fnm, entry)
		real_subdir = os.path.isdir(child) and not os.path.islink(child)
		if real_subdir or os.path.normcase(child[-3:]) == ".py":
			cleanfile(child, d)

# Help text; the single %s is filled with the program name.
usage = """\
%s [options] [path...]
 options
  -T : reformat to TABS ONLY
  -S : reformat to SPACES ONLY ( -i option is important)
  -M : reformat to MIXED SPACES / TABS ( -t and -i options important)
  -t<n> : tab is worth <n> characters
  -i<n> : indents should be <n> characters 
  -h : print this text
 path is file or directory
"""

def main(argv=None):
	"""Command-line entry point; returns the process exit status.

	``argv`` defaults to ``sys.argv`` and includes the program name as its
	first element.  Returns 2 for an unknown option or a -t/-i value that is
	not a positive integer, otherwise 0.  All options are checked before
	any file is touched.
	"""
	if argv is None:
		argv = sys.argv
	if argv:
		prog = argv[0]
	else:
		prog = 'tabcleaner'
	try:
		opts, args = getopt.getopt(argv[1:], "TSMht:i:")
	except getopt.GetoptError as err:
		# Show the usage text rather than a traceback for a bad option.
		sys.stderr.write("%s: %s\n" % (prog, err))
		sys.stderr.write(usage % prog)
		return 2
	numeric = {'-t': 'tabs', '-i': 'indent'}
	d = {}
	for flag, value in opts:
		if flag == '-T':
			d['style'] = TABSONLY
		elif flag == '-S':
			d['style'] = SPACESONLY
		elif flag == '-M':
			d['style'] = MIXED
		elif flag in numeric:
			try:
				ok = int(value) >= 1
			except ValueError:
				ok = False
			if not ok:
				sys.stderr.write("%s: option %s needs a positive integer, got %r\n"
				                 % (prog, flag, value))
				return 2
			# Stored as given: PyText converts the option itself.
			d[numeric[flag]] = value
		elif flag == '-h':
			print(usage % prog)
			return 0
	if not args:
		print(usage % prog)
	for arg in args:
		cleanfile(arg, d)
	return 0

if __name__ == '__main__':
	sys.exit(main())

--------------E6C62C8F6E2E246705BDC51B--