From 94fbe9711a1179b3c4d0eb2e9822787d90231719 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Wed, 30 Jul 2014 11:46:05 +0100
Subject: [PATCH] py: Change lexer stream API to return bytes not chars.

Lexer is now 8-bit clean inside strings.
---
 py/lexer.c          | 38 ++++++++++++++++++++------------------
 py/lexer.h          | 12 ++++++------
 py/lexerstr.c       |  6 +++---
 py/lexerunix.c      | 14 +++++++-------
 stmhal/lexerfatfs.c | 12 ++++++------
 5 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/py/lexer.c b/py/lexer.c
index dab75153ac..ff137fbbb4 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -45,7 +45,7 @@
 struct _mp_lexer_t {
     qstr source_name;           // name of source
     void *stream_data;          // data for stream
-    mp_lexer_stream_next_char_t stream_next_char;   // stream callback to get next char
+    mp_lexer_stream_next_byte_t stream_next_byte;   // stream callback to get next byte
     mp_lexer_stream_close_t stream_close;           // stream callback to free
 
     unichar chr0, chr1, chr2;   // current cached characters from source
@@ -103,7 +103,7 @@ void mp_token_show(const mp_token_t *tok) {
 #define CUR_CHAR(lex) ((lex)->chr0)
 
 STATIC bool is_end(mp_lexer_t *lex) {
-    return lex->chr0 == MP_LEXER_CHAR_EOF;
+    return lex->chr0 == MP_LEXER_EOF;
 }
 
 STATIC bool is_physical_newline(mp_lexer_t *lex) {
@@ -171,7 +171,7 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
 }
 
 STATIC void next_char(mp_lexer_t *lex) {
-    if (lex->chr0 == MP_LEXER_CHAR_EOF) {
+    if (lex->chr0 == MP_LEXER_EOF) {
         return;
     }
 
@@ -200,10 +200,10 @@ STATIC void next_char(mp_lexer_t *lex) {
     for (; advance > 0; advance--) {
         lex->chr0 = lex->chr1;
         lex->chr1 = lex->chr2;
-        lex->chr2 = lex->stream_next_char(lex->stream_data);
-        if (lex->chr2 == MP_LEXER_CHAR_EOF) {
+        lex->chr2 = lex->stream_next_byte(lex->stream_data);
+        if (lex->chr2 == MP_LEXER_EOF) {
             // EOF
-            if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
+            if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
                 lex->chr2 = '\n'; // insert newline at end of file
             }
         }
@@ -491,8 +491,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                         vstr_add_char(&lex->vstr, '\\');
                     } else {
                         switch (c) {
-                            case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
-                            case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
+                            case MP_LEXER_EOF: break; // TODO a proper error message?
+                            case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
                             case '\\': break;
                             case '\'': break;
                             case '"': break;
@@ -546,7 +546,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                                 break;
                         }
                     }
-                    if (c != MP_LEXER_CHAR_EOF) {
+                    if (c != MP_LEXER_EOF) {
                         if (c < 0x110000 && !is_bytes) {
                             vstr_add_char(&lex->vstr, c);
                         } else if (c < 0x100 && is_bytes) {
@@ -556,7 +556,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                         }
                     }
                 } else {
-                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+                    // Add the "character" as a byte so that we remain 8-bit clean.
+                    // This way, strings are parsed correctly whether or not they contain utf-8 chars.
+                    vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
                 }
             }
             next_char(lex);
@@ -728,7 +730,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
     }
 }
 
-mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
+mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
     mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
 
     // check for memory allocation error
@@ -741,7 +743,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
 
     lex->source_name = src_name;
     lex->stream_data = stream_data;
-    lex->stream_next_char = stream_next_char;
+    lex->stream_next_byte = stream_next_byte;
     lex->stream_close = stream_close;
     lex->line = 1;
     lex->column = 1;
@@ -762,18 +764,18 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
     lex->indent_level[0] = 0;
 
     // preload characters
-    lex->chr0 = stream_next_char(stream_data);
-    lex->chr1 = stream_next_char(stream_data);
-    lex->chr2 = stream_next_char(stream_data);
+    lex->chr0 = stream_next_byte(stream_data);
+    lex->chr1 = stream_next_byte(stream_data);
+    lex->chr2 = stream_next_byte(stream_data);
 
     // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
-    if (lex->chr0 == MP_LEXER_CHAR_EOF) {
+    if (lex->chr0 == MP_LEXER_EOF) {
         lex->chr0 = '\n';
-    } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
+    } else if (lex->chr1 == MP_LEXER_EOF) {
         if (lex->chr0 != '\n' && lex->chr0 != '\r') {
             lex->chr1 = '\n';
         }
-    } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
+    } else if (lex->chr2 == MP_LEXER_EOF) {
         if (lex->chr1 != '\n' && lex->chr1 != '\r') {
             lex->chr2 = '\n';
         }
diff --git a/py/lexer.h b/py/lexer.h
index 3f5176be10..d70735f6d7 100644
--- a/py/lexer.h
+++ b/py/lexer.h
@@ -139,18 +139,18 @@ typedef struct _mp_token_t {
     mp_uint_t len;              // (byte) length of string of token
 } mp_token_t;
 
-// the next-char function must return the next character in the stream
-// it must return MP_LEXER_CHAR_EOF if end of stream
-// it can be called again after returning MP_LEXER_CHAR_EOF, and in that case must return MP_LEXER_CHAR_EOF
-#define MP_LEXER_CHAR_EOF (-1)
-typedef unichar (*mp_lexer_stream_next_char_t)(void*);
+// the next-byte function must return the next byte in the stream
+// it must return MP_LEXER_EOF if end of stream
+// it can be called again after returning MP_LEXER_EOF, and in that case must return MP_LEXER_EOF
+#define MP_LEXER_EOF (-1)
+typedef mp_uint_t (*mp_lexer_stream_next_byte_t)(void*);
 typedef void (*mp_lexer_stream_close_t)(void*);
 
 typedef struct _mp_lexer_t mp_lexer_t;
 
 void mp_token_show(const mp_token_t *tok);
 
-mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close);
+mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close);
 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len);
 
 void mp_lexer_free(mp_lexer_t *lex);
diff --git a/py/lexerstr.c b/py/lexerstr.c
index 7655e4be6d..a919dc7e14 100644
--- a/py/lexerstr.c
+++ b/py/lexerstr.c
@@ -36,11 +36,11 @@ typedef struct _mp_lexer_str_buf_t {
     const char *src_end;        // end (exclusive) of source
 } mp_lexer_str_buf_t;
 
-STATIC unichar str_buf_next_char(mp_lexer_str_buf_t *sb) {
+STATIC mp_uint_t str_buf_next_byte(mp_lexer_str_buf_t *sb) {
     if (sb->src_cur < sb->src_end) {
-        return *sb->src_cur++;
+        return (unsigned char)*sb->src_cur++; // plain char may be signed: keep result in 0..255 so byte 0xff is never mistaken for MP_LEXER_EOF
     } else {
-        return MP_LEXER_CHAR_EOF;
+        return MP_LEXER_EOF;
     }
 }
 
@@ -57,5 +57,5 @@ mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t
     sb->src_beg = str;
     sb->src_cur = str;
     sb->src_end = str + len;
-    return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_char_t)str_buf_next_char, (mp_lexer_stream_close_t)str_buf_free);
+    return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_byte_t)str_buf_next_byte, (mp_lexer_stream_close_t)str_buf_free);
 }
diff --git a/py/lexerunix.c b/py/lexerunix.c
index 51bc915b22..9d669f2bfe 100644
--- a/py/lexerunix.c
+++ b/py/lexerunix.c
@@ -41,20 +41,20 @@
 
 typedef struct _mp_lexer_file_buf_t {
     int fd;
-    char buf[20];
-    uint len;
-    uint pos;
+    byte buf[20];
+    mp_uint_t len;
+    mp_uint_t pos;
 } mp_lexer_file_buf_t;
 
-STATIC unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
+STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) {
     if (fb->pos >= fb->len) {
         if (fb->len == 0) {
-            return MP_LEXER_CHAR_EOF;
+            return MP_LEXER_EOF;
         } else {
             int n = read(fb->fd, fb->buf, sizeof(fb->buf));
             if (n <= 0) {
                 fb->len = 0;
-                return MP_LEXER_CHAR_EOF;
+                return MP_LEXER_EOF;
             }
             fb->len = n;
             fb->pos = 0;
@@ -78,7 +78,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
     int n = read(fb->fd, fb->buf, sizeof(fb->buf));
     fb->len = n;
     fb->pos = 0;
-    return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close);
+    return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close);
 }
 
 #endif // MICROPY_HELPER_LEXER_UNIX
diff --git a/stmhal/lexerfatfs.c b/stmhal/lexerfatfs.c
index c578b13af6..21e3a2007e 100644
--- a/stmhal/lexerfatfs.c
+++ b/stmhal/lexerfatfs.c
@@ -36,20 +36,20 @@
 
 typedef struct _mp_lexer_file_buf_t {
     FIL fp;
-    char buf[20];
+    byte buf[20];
     uint16_t len;
     uint16_t pos;
 } mp_lexer_file_buf_t;
 
-static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
+STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) {
     if (fb->pos >= fb->len) {
         if (fb->len < sizeof(fb->buf)) {
-            return MP_LEXER_CHAR_EOF;
+            return MP_LEXER_EOF;
         } else {
             UINT n;
             f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
             if (n == 0) {
-                return MP_LEXER_CHAR_EOF;
+                return MP_LEXER_EOF;
             }
             fb->len = n;
             fb->pos = 0;
@@ -58,7 +58,7 @@ static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
     return fb->buf[fb->pos++];
 }
 
-static void file_buf_close(mp_lexer_file_buf_t *fb) {
+STATIC void file_buf_close(mp_lexer_file_buf_t *fb) {
     f_close(&fb->fp);
     m_del_obj(mp_lexer_file_buf_t, fb);
 }
@@ -74,5 +74,5 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
     f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
     fb->len = n;
     fb->pos = 0;
-    return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close);
+    return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close);
 }