[Python-checkins] python/dist/src/Parser tokenizer_pgen.c,NONE,2.1 parsetok.c,2.32,2.33 tokenizer.c,2.54,2.55 tokenizer.h,2.16,2.17

loewis@users.sourceforge.net loewis@users.sourceforge.net
Sun, 04 Aug 2002 10:29:54 -0700


Update of /cvsroot/python/python/dist/src/Parser
In directory usw-pr-cvs1:/tmp/cvs-serv1805/Parser

Modified Files:
	parsetok.c tokenizer.c tokenizer.h 
Added Files:
	tokenizer_pgen.c 
Log Message:
Patch #534304: Implement phase 1 of PEP 263.


--- NEW FILE: tokenizer_pgen.c ---
#define PGEN
#include "tokenizer.c"

Index: parsetok.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/parsetok.c,v
retrieving revision 2.32
retrieving revision 2.33
diff -C2 -d -r2.32 -r2.33
*** parsetok.c	4 Aug 2002 06:26:49 -0000	2.32
--- parsetok.c	4 Aug 2002 17:29:52 -0000	2.33
***************
*** 9,12 ****
--- 9,13 ----
  #include "parsetok.h"
  #include "errcode.h"
+ #include "graminit.h"
  
  int Py_TabcheckFlag;
***************
*** 46,51 ****
  	}
  
  	if (Py_TabcheckFlag || Py_VerboseFlag) {
- 		tok->filename = filename ? filename : "<string>";
  		tok->altwarning = (tok->filename != NULL);
  		if (Py_TabcheckFlag >= 2)
--- 47,52 ----
  	}
  
+         tok->filename = filename ? filename : "<string>";
  	if (Py_TabcheckFlag || Py_VerboseFlag) {
  		tok->altwarning = (tok->filename != NULL);
  		if (Py_TabcheckFlag >= 2)
***************
*** 79,84 ****
  		return NULL;
  	}
  	if (Py_TabcheckFlag || Py_VerboseFlag) {
- 		tok->filename = filename;
  		tok->altwarning = (filename != NULL);
  		if (Py_TabcheckFlag >= 2)
--- 80,85 ----
  		return NULL;
  	}
+ 	tok->filename = filename;
  	if (Py_TabcheckFlag || Py_VerboseFlag) {
  		tok->altwarning = (filename != NULL);
  		if (Py_TabcheckFlag >= 2)
***************
*** 186,189 ****
--- 187,197 ----
  			}
  		}
+ 	} else if (tok->encoding != NULL) {
+ 		node* r = PyNode_New(encoding_decl);
+ 		r->n_str = tok->encoding;
+ 		r->n_nchildren = 1;
+ 		r->n_child = n;
+ 		tok->encoding = NULL;
+ 		n = r;
  	}
  

Index: tokenizer.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/tokenizer.c,v
retrieving revision 2.54
retrieving revision 2.55
diff -C2 -d -r2.54 -r2.55
*** tokenizer.c	14 Apr 2002 20:12:41 -0000	2.54
--- tokenizer.c	4 Aug 2002 17:29:52 -0000	2.55
***************
*** 6,13 ****
--- 6,22 ----
  
  #include <ctype.h>
+ #include <assert.h>
  
  #include "tokenizer.h"
  #include "errcode.h"
  
+ #ifndef PGEN
+ #include "unicodeobject.h"
+ #include "stringobject.h"
+ #include "fileobject.h"
+ #include "codecs.h"
+ #include "abstract.h"
+ #endif /* PGEN */
+ 
  extern char *PyOS_Readline(char *);
  /* Return malloc'ed string including trailing \n;
***************
*** 115,121 ****
--- 124,537 ----
  	tok->alttabsize = 1;
  	tok->altindstack[0] = 0;
+ 	tok->decoding_state = 0;
+ 	tok->decoding_erred = 0;
+ 	tok->read_coding_spec = 0;
+ 	tok->issued_encoding_warning = 0;
+ 	tok->encoding = NULL;
+ 	tok->decoding_readline = NULL;
+ 	tok->decoding_buffer = NULL;
  	return tok;
  }
  
+ #ifdef PGEN
+ 
+ /* pgen build: read one line from TOK's file into S with plain fgets()
+    (SIZE is the capacity of S); no encoding detection is performed.  */
+ static char *
+ decoding_fgets(char *s, int size, struct tok_state *tok)
+ {
+ 	FILE *input = tok->fp;
+ 	return fgets(s, size, input);
+ }
+ 
+ /* pgen build: report end-of-file straight from TOK's stdio stream.  */
+ static int
+ decoding_feof(struct tok_state *tok)
+ {
+ 	FILE *input = tok->fp;
+ 	return feof(input);
+ }
+ 
+ /* pgen build: strings are used as-is; STR is returned unchanged and
+    TOK is not touched.  */
+ static const char *
+ decode_str(const char *str, struct tok_state *tok)
+ {
+ 	(void)tok;	/* unused in this configuration */
+ 	return str;
+ }
+ 
+ #else /* PGEN */
+ 
+ static char *
+ error_ret(struct tok_state *tok) /* XXX */
+ {
+ 	tok->decoding_erred = 1;
+ 	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
+ 		PyMem_DEL(tok->buf);
+ 	tok->buf = NULL;
+ 	return NULL;		/* as if it were EOF */
+ }
+ 
+ /* Return a freshly allocated, NUL-terminated copy of the first LEN
+    bytes of S, or NULL if the allocation fails.  The memory is
+    obtained with PyMem_NEW.  */
+ static char *
+ new_string(const char *s, int len)
+ {
+ 	char *copy = PyMem_NEW(char, len + 1);
+ 	if (copy == NULL)
+ 		return NULL;
+ 	memcpy(copy, s, len);
+ 	copy[len] = '\0';
+ 	return copy;
+ }
+ 
+ /* Map the encoding name S onto a canonical spelling.
+ 
+    Only the first 12 characters of S are examined, lower-cased and with
+    '_' treated as '-'.  Returns the literal "utf-8" for "utf-8" or a name
+    starting with "utf-8-"; the literal "iso-8859-1" for "latin-1",
+    "iso-8859-1", "iso-latin-1" or a name starting with one of those
+    followed by '-'; otherwise returns S itself.  Callers can therefore
+    detect a normalization by comparing the result with S.  The returned
+    literals must not be modified or freed.  */
+ static char *
+ get_normal_name(char *s)	/* for utf-8 and latin-1 */
+ {
+ 	char buf[13];
+ 	int i;
+ 	for (i = 0; i < 12; i++) {
+ 		/* Go through unsigned char: handing a negative char value
+ 		   to tolower() is undefined behaviour.  */
+ 		int c = (unsigned char)s[i];
+ 		if (c == '\0') break;
+ 		else if (c == '_') buf[i] = '-';
+ 		else buf[i] = tolower(c);
+ 	}
+ 	buf[i] = '\0';
+ 	if (strcmp(buf, "utf-8") == 0 ||
+ 	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
+ 	else if (strcmp(buf, "latin-1") == 0 ||
+ 		 strcmp(buf, "iso-8859-1") == 0 ||
+ 		 strcmp(buf, "iso-latin-1") == 0 ||
+ 		 strncmp(buf, "latin-1-", 8) == 0 ||
+ 		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
+ 		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
+ 	else return s;
+ }
+ 
+ /* Return the coding spec in S, or NULL if none is found.
+ 
+    Looks in S (SIZE bytes) for "coding" immediately followed by ':' or
+    '=', skips blanks and tabs, and takes the following run of
+    alphanumerics, '-', '_' and '.' as the encoding name.  The name is
+    passed through get_normal_name() and returned in memory obtained
+    from new_string(); the caller owns it.  NULL is also returned when
+    that allocation fails.  */
+ 
+ static char *
+ get_coding_spec(const char *s, int size)
+ {
+ 	int i;
+ 	for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
+ 		const char* t = s + i;
+ 		if (strncmp(t, "coding", 6) == 0) {
+ 			const char* begin = NULL;
+ 			t += 6;
+ 			if (t[0] != ':' && t[0] != '=')
+ 				continue;
+ 			do {
+ 				t++;
+ 			} while (t[0] == '\x20' || t[0] == '\t');
+ 
+ 			begin = t;
+ 			/* isalnum() needs a value representable as
+ 			   unsigned char, hence the cast.  */
+ 			while (isalnum((unsigned char)t[0]) ||
+ 			       t[0] == '-' || t[0] == '_' || t[0] == '.')
+ 				t++;
+ 
+ 			if (begin < t) {
+ 				char* r = new_string(begin, t - begin);
+ 				char* q;
+ 				if (r == NULL)
+ 					return NULL;
+ 				q = get_normal_name(r);
+ 				if (r != q) {
+ 					/* The canonical name can be longer
+ 					   than what was written ("latin-1"
+ 					   -> "iso-8859-1"), so it must not
+ 					   be copied over R in place.  */
+ 					PyMem_DEL(r);
+ 					r = new_string(q, strlen(q));
+ 				}
+ 				return r;
+ 			}
+ 		}
+ 	}
+ 	return NULL;
+ }
+ 
+ /* Check whether the line contains a coding spec. If it does,
+    invoke the set_readline function for the new encoding.
+    This function receives the tok_state and the new encoding.
+    Return 1 on success, 0 on failure.
+ 
+    A line without a coding spec counts as success.  When a spec is
+    found, tok->read_coding_spec is set.  If no encoding has been
+    recorded yet, "utf-8" and "iso-8859-1" are simply stored in
+    tok->encoding; any other name is handed to SET_READLINE and, if
+    that succeeds, stored with tok->decoding_state switched to -1.
+    If an encoding was already recorded (from a BOM), the spec must
+    match it, otherwise 0 is returned.  The string returned by
+    get_coding_spec() is either stored in tok->encoding or freed
+    here.  */
+ 
+ static int
+ check_coding_spec(const char* line, int size, struct tok_state *tok,
+ 		  int set_readline(struct tok_state *, const char *))
+ {
+ 	int r = 1;
+ 	char* cs = get_coding_spec(line, size);
+ 	if (cs != NULL) {
+ 		tok->read_coding_spec = 1;
+ 		if (tok->encoding == NULL) {
+ 			assert(tok->decoding_state == 1); /* raw */
+ 			if (strcmp(cs, "utf-8") == 0 ||
+ 			    strcmp(cs, "iso-8859-1") == 0) {
+ 				tok->encoding = cs;
+ 			} else {
+ 				r = set_readline(tok, cs);
+ 				if (r) {
+ 					tok->encoding = cs;
+ 					tok->decoding_state = -1;
+ 				} else {
+ 					/* nobody took ownership of cs */
+ 					PyMem_DEL(cs);
+ 				}
+ 			}
+ 		} else {	/* then, compare cs with BOM */
+ 			r = (strcmp(tok->encoding, cs) == 0);
+ 			PyMem_DEL(cs);
+ 		}
+ 	}
+ 	return r;
+ }
+ 
+ /* See whether the file starts with a BOM. If it does,
+    invoke the set_readline function with the new encoding.
+    Return 1 on success, 0 on failure.
+ 
+    GET_CHAR and UNGET_CHAR fetch and push back one byte of TOK's input.
+    tok->decoding_state is set to 1 (raw) before anything else is decided.
+    With the UTF-16 branches compiled out, only the UTF-8 signature
+    EF BB BF is recognized: it is consumed and tok->encoding is set to a
+    new copy of "utf-8".  In that configuration SET_READLINE is never
+    called and every path returns 1.  */
+ 
+ static int
+ check_bom(int get_char(struct tok_state *),
+ 	  void unget_char(int, struct tok_state *),
+ 	  int set_readline(struct tok_state *, const char *),
+ 	  struct tok_state *tok)
+ {
+ 	int ch = get_char(tok);
+ 	tok->decoding_state = 1;
+ 	if (ch == EOF) {	/* nothing to inspect */
+ 		return 1;
+ 	} else if (ch == 0xEF) {	/* possible UTF-8 signature */
+ 		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
+ 		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
+ #if 0
+ 	/* Disable support for UTF-16 BOMs until a decision
+ 	   is made whether this needs to be supported.  */
+ 	} else if (ch == 0xFE) {
+ 		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
+ 		if (!set_readline(tok, "utf-16-be")) return 0;
+ 		tok->decoding_state = -1;
+ 	} else if (ch == 0xFF) {
+ 		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
+ 		if (!set_readline(tok, "utf-16-le")) return 0;
+ 		tok->decoding_state = -1;
+ #endif
+ 	} else {	/* not a BOM lead byte: give it back */
+ 		unget_char(ch, tok);
+ 		return 1;
+ 	}
+ 	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
+ 	return 1;
+   NON_BOM:
+ 	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token;
+ 	   the bytes already consumed are not restored, a single 0xFF is
+ 	   pushed back in their place */
+ 	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
+ 	return 1;
+ }
+ 
+ /* Read a line of text from TOK into S, using the stream in TOK.
+    Return NULL on failure, else S.
+ 
+    The line comes from tok->decoding_buffer if one is pending (it is
+    consumed), otherwise from calling tok->decoding_readline.  It is
+    converted to UTF-8 and copied into S, which holds SIZE bytes.  NULL
+    is also returned, without flagging an error, when the line is empty
+    (EOF).  A line that does not fit in S is reported through
+    error_ret() instead of being copied past the end of the buffer.  */
+ 
+ static char *
+ fp_readl(char *s, int size, struct tok_state *tok)
+ {
+ 	PyObject* utf8;
+ 	const char* str;
+ 	PyObject* buf = tok->decoding_buffer;
+ 	if (buf == NULL) {
+ 		buf = PyObject_CallObject(tok->decoding_readline, NULL);
+ 		if (buf == NULL) return error_ret(tok);
+ 	} else {
+ 		tok->decoding_buffer = NULL;
+ 	}
+ 	utf8 = PyUnicode_AsUTF8String(buf);
+ 	Py_DECREF(buf);
+ 	if (utf8 == NULL) return error_ret(tok);
+ 	str = PyString_AsString(utf8);
+ 	/* The copy below needs strlen(str) + 1 bytes; refuse anything
+ 	   larger rather than relying on an assert that vanishes under
+ 	   NDEBUG.  */
+ 	if (str == NULL || size <= 0 || strlen(str) >= (size_t)size) {
+ 		Py_DECREF(utf8);
+ 		return error_ret(tok);
+ 	}
+ 	strcpy(s, str);
+ 	Py_DECREF(utf8);
+ 	if (s[0] == '\0') return NULL; /* EOF */
+ 	return s;
+ }
+ 
+ /* Install a decoding readline on TOK.
+ 
+    TOK's file is wrapped in a Python file object (opened "rb", with no
+    close function), a StreamReader for the codec named ENC is created
+    on top of it, and the reader's bound "readline" method is stored in
+    tok->decoding_readline.
+ 
+    Called from check_bom and check_coding_spec.  ENC is usually the
+    future value of tok->encoding, except for the (currently
+    unsupported) case of UTF-16.
+ 
+    Return 1 on success, 0 on failure. */
+ 
+ static int
+ fp_setreadl(struct tok_state *tok, const char* enc)
+ {
+ 	PyObject *method = NULL;
+ 	PyObject *file_obj = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
+ 
+ 	if (file_obj != NULL) {
+ 		PyObject *codec_reader = PyCodec_StreamReader(enc, file_obj, NULL);
+ 		Py_DECREF(file_obj);
+ 		if (codec_reader != NULL) {
+ 			method = PyObject_GetAttrString(codec_reader, "readline");
+ 			Py_DECREF(codec_reader);
+ 		}
+ 	}
+ 	if (method == NULL)
+ 		return 0;
+ 
+ 	tok->decoding_readline = method;
+ 	return 1;
+ }
+ 
+ /* Read one byte from TOK's file; the result is whatever getc()
+    yields, including EOF. */
+ 
+ static int
+ fp_getc(struct tok_state *tok)
+ {
+ 	int byte = getc(tok->fp);
+ 	return byte;
+ }
+ 
+ /* Push the byte C back onto TOK's file with ungetc(); the result of
+    ungetc() is not examined.  */
+ 
+ static void
+ fp_ungetc(int c, struct tok_state *tok)
+ {
+ 	(void)ungetc(c, tok->fp);
+ }
+ 
+ /* Read a line of input from TOK. Determine encoding
+    if necessary.
+ 
+    S/SIZE describe the destination buffer.  On the first call the
+    input is checked for a BOM; afterwards lines are read either raw
+    (tok->decoding_state > 0) or through the installed codec
+    (tok->decoding_state < 0).  While tok->lineno < 2 and no coding
+    spec has been seen, each line read is searched for one.  For raw
+    input with no encoding recorded, the first byte above 127
+    triggers a single DeprecationWarning.
+ 
+    Returns the line that was read, or NULL at end of input or after
+    an error (errors go through error_ret()).  */
+ 
+ static char *
+ decoding_fgets(char *s, int size, struct tok_state *tok)
+ {
+ 	char *line;
+ 	int warn = 0, badchar = 0;
+ 	for (;;) {
+ 		if (tok->decoding_state < 0) {
+ 			/* We already have a codec associated with
+ 			   this input. */
+ 			line = fp_readl(s, size, tok);
+ 			break;
+ 		} else if (tok->decoding_state > 0) {
+ 			/* We want a 'raw' read. */
+ 			line = Py_UniversalNewlineFgets(s, size, 
+ 							tok->fp, NULL);
+ 			warn = 1;
+ 			break;
+ 		} else {
+ 			/* We have not yet determined the encoding.
+ 			   If an encoding is found, use the file-pointer
+ 			   reader functions from now on. */
+ 			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
+ 				return error_ret(tok);
+ 			assert(tok->decoding_state != 0);
+ 		}
+ 	}
+ 	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
+ 		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
+ 			return error_ret(tok);
+ 		}
+ 	}
+ 	/* This whole function is only compiled when PGEN is not defined,
+ 	   so the warning code needs no preprocessor guard of its own.  */
+ 	if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
+ 		/* Inspect the line as unsigned bytes so that values above
+ 		   127 compare as such whatever the signedness of char.  */
+ 		const unsigned char *c;
+ 		for (c = (const unsigned char *)line; *c; c++) {
+ 			if (*c > 127) {
+ 				badchar = *c;
+ 				break;
+ 			}
+ 		}
+ 	}
+ 	if (badchar) {
+ 		char buf[200];
+ 		sprintf(buf, "Non-ASCII character '\\x%.2x', "
+ 			"but no declared encoding", badchar);
+ 		PyErr_WarnExplicit(PyExc_DeprecationWarning,
+ 				   buf, tok->filename, tok->lineno, 
+ 				   NULL, NULL);
+ 		tok->issued_encoding_warning = 1;
+ 	}
+ 	return line;
+ }
+ 
+ /* Tell whether TOK's input is exhausted.
+ 
+    Without a codec (tok->decoding_state >= 0) this is feof() on the
+    file.  With a codec, the next line is fetched through
+    tok->decoding_readline unless one is already pending, kept in
+    tok->decoding_buffer, and the input counts as finished when that
+    line has length zero.  A failing readline call is recorded with
+    error_ret() and reported as end of input.  */
+ static int
+ decoding_feof(struct tok_state *tok)
+ {
+ 	PyObject *pending;
+ 
+ 	if (tok->decoding_state >= 0)
+ 		return feof(tok->fp);
+ 
+ 	pending = tok->decoding_buffer;
+ 	if (pending == NULL) {
+ 		pending = PyObject_CallObject(tok->decoding_readline, NULL);
+ 		if (pending == NULL) {
+ 			error_ret(tok);
+ 			return 1;
+ 		}
+ 		tok->decoding_buffer = pending;
+ 	}
+ 	return PyObject_Length(pending) == 0;
+ }
+ 
+ /* Fetch a byte from TOK, using the string buffer.
+    The byte is returned as a value in 0..255 so that callers can
+    compare it with constants such as 0xEF even where plain char is
+    signed.  */
+ 
+ static int buf_getc(struct tok_state *tok) {
+ 	return (unsigned char)*tok->str++;
+ }
+ 
+ /* Unfetch a byte from TOK, using the string buffer.
+    Only the read position is moved back; nothing is written.  The
+    assertion compares byte values, so it holds however the caller
+    obtained C.  */
+ 
+ static void buf_ungetc(int c, struct tok_state *tok) {
+ 	tok->str--;
+ 	/* tok->cur may point to read-only segment */
+ 	assert((unsigned char)*tok->str == (unsigned char)c);
+ }
+ 
+ /* String-tokenizer counterpart of fp_setreadl: there is no stream to
+    wrap, so the encoding name ENC is merely remembered in tok->enc
+    (the pointer is stored, not copied).  Always returns 1.  */
+ 
+ static int
+ buf_setreadl(struct tok_state *tok, const char* enc)
+ {
+ 	tok->enc = enc;
+ 
+ 	return 1;
+ }
+ 
+ /* Decode the NUL-terminated C string STR from encoding ENC and
+    return the text re-encoded as a UTF-8 Python string object.
+    Returns NULL if either the decoding or the encoding step fails.  */
+ 
+ static PyObject *
+ translate_into_utf8(const char* str, const char* enc) {
+ 	PyObject *encoded = NULL;
+ 	PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
+ 
+ 	if (decoded != NULL) {
+ 		encoded = PyUnicode_AsUTF8String(decoded);
+ 		Py_DECREF(decoded);
+ 	}
+ 	return encoded;
+ }
+ 
+ /* Decode a byte string STR for use as the buffer of TOK.
+    Look for encoding declarations inside STR, and record them
+    inside TOK.
+ 
+    A leading BOM is skipped, then the first two lines are searched
+    for a coding spec.  If an encoding that needs conversion was
+    recorded in tok->enc, the text is translated to UTF-8 and the
+    string object holding the translation is parked in
+    tok->decoding_buffer so that the returned pointer stays valid.
+ 
+    Returns the text to tokenize, or NULL on failure.  */
+ 
+ static const char *
+ decode_str(const char *str, struct tok_state *tok)
+ {
+ 	PyObject* utf8 = NULL;
+ 	const char *s;
+ 	int lineno = 0;
+ 	tok->enc = NULL;
+ 	tok->str = str;
+ 	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
+ 		return NULL;
+ 	str = tok->str;		/* string after BOM if any */
+ 	assert(str);
+ 	if (tok->enc != NULL) {
+ 		utf8 = translate_into_utf8(str, tok->enc);
+ 		if (utf8 == NULL)
+ 			return NULL;
+ 		str = PyString_AsString(utf8);
+ 	}
+ 	/* Find the end of the second line (or of the string).  */
+ 	for (s = str;; s++) {
+ 		if (*s == '\0') break;
+ 		else if (*s == '\n') {
+ 			lineno++;
+ 			if (lineno == 2) break;
+ 		}
+ 	}
+ 	tok->enc = NULL;
+ 	if (!check_coding_spec(str, s - str, tok, buf_setreadl)) {
+ 		/* A translation made above is not handed to TOK on
+ 		   this path, so release it here.  */
+ 		Py_XDECREF(utf8);
+ 		return NULL;
+ 	}
+ 	if (tok->enc != NULL) {
+ 		assert(utf8 == NULL);
+ 		utf8 = translate_into_utf8(str, tok->enc);
+ 		if (utf8 == NULL)
+ 			return NULL;
+ 		str = PyString_AsString(utf8);
+ 	}
+ 	assert(tok->decoding_buffer == NULL);
+ 	tok->decoding_buffer = utf8; /* CAUTION */
+ 	return str;
+ }
+ 
+ #endif /* PGEN */
  
  /* Set up tokenizer for string */
***************
*** 127,130 ****
--- 543,549 ----
  	if (tok == NULL)
  		return NULL;
+ 	str = (char *)decode_str(str, tok);
+ 	if (str == NULL)
+ 		return NULL;
  	tok->buf = tok->cur = tok->end = tok->inp = str;
  	return tok;
***************
*** 158,161 ****
--- 577,584 ----
  PyTokenizer_Free(struct tok_state *tok)
  {
+ 	if (tok->encoding != NULL)
+ 		PyMem_DEL(tok->encoding);
+ 	Py_XDECREF(tok->decoding_readline);
+ 	Py_XDECREF(tok->decoding_buffer);
  	if (tok->fp != NULL && tok->buf != NULL)
  		PyMem_DEL(tok->buf);
***************
*** 247,252 ****
  					tok->end = tok->buf + BUFSIZ;
  				}
! 				if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf),
! 					  tok->fp, NULL) == NULL) {
  					tok->done = E_EOF;
  					done = 1;
--- 670,675 ----
  					tok->end = tok->buf + BUFSIZ;
  				}
! 				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
! 					  tok) == NULL) {
  					tok->done = E_EOF;
  					done = 1;
***************
*** 260,264 ****
  			else {
  				cur = tok->cur - tok->buf;
! 				if (feof(tok->fp)) {
  					tok->done = E_EOF;
  					done = 1;
--- 683,687 ----
  			else {
  				cur = tok->cur - tok->buf;
! 				if (decoding_feof(tok)) {
  					tok->done = E_EOF;
  					done = 1;
***************
*** 286,292 ****
  				tok->start = curstart < 0 ? NULL :
  					     tok->buf + curstart;
! 				if (Py_UniversalNewlineFgets(tok->inp,
  					       (int)(tok->end - tok->inp),
! 					       tok->fp, NULL) == NULL) {
  					/* Last line does not end in \n,
  					   fake one */
--- 709,715 ----
  				tok->start = curstart < 0 ? NULL :
  					     tok->buf + curstart;
! 				if (decoding_fgets(tok->inp,
  					       (int)(tok->end - tok->inp),
! 					       tok) == NULL) {
  					/* Last line does not end in \n,
  					   fake one */
***************
*** 507,513 ****
  /* Get next token, after space stripping etc. */
  
! int
! PyTokenizer_Get(register struct tok_state *tok, char **p_start,
! 		char **p_end)
  {
  	register int c;
--- 930,935 ----
  /* Get next token, after space stripping etc. */
  
! static int
! tok_get(register struct tok_state *tok, char **p_start, char **p_end)
  {
  	register int c;
***************
*** 916,919 ****
--- 1338,1351 ----
  }
  
+ /* Public entry point: fetch the next token through tok_get().  If a
+    decoding error has been flagged on TOK, the token type is replaced
+    by ERRORTOKEN and tok->done is set to E_DECODE.  */
+ int
+ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
+ {
+ 	int token_type = tok_get(tok, p_start, p_end);
+ 
+ 	if (!tok->decoding_erred)
+ 		return token_type;
+ 
+ 	tok->done = E_DECODE;
+ 	return ERRORTOKEN;
+ }
  
  #ifdef Py_DEBUG

Index: tokenizer.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/tokenizer.h,v
retrieving revision 2.16
retrieving revision 2.17
diff -C2 -d -r2.16 -r2.17
*** tokenizer.h	1 Sep 2000 23:29:28 -0000	2.16
--- tokenizer.h	4 Aug 2002 17:29:52 -0000	2.17
***************
*** 5,8 ****
--- 5,9 ----
  #endif
  
+ #include "object.h"
  
  /* Tokenizer interface */
***************
*** 39,42 ****
--- 40,53 ----
  	int alttabsize;	/* Alternate tab spacing */
  	int altindstack[MAXINDENT];	/* Stack of alternate indents */
+ 	/* Stuff for PEP 0263 */
+ 	int decoding_state;	/* -1:decoding, 0:init, 1:raw */
+ 	int decoding_erred;	/* whether erred in decoding  */
+ 	int read_coding_spec;	/* whether 'coding:...' has been read  */
+ 	int issued_encoding_warning; /* whether non-ASCII warning was issued */
+ 	char *encoding;
+ 	PyObject *decoding_readline; /* codecs.open(...).readline */
+ 	PyObject *decoding_buffer;
+ 	const char* enc;
+ 	const char* str;
  };