From 94fbe9711a1179b3c4d0eb2e9822787d90231719 Mon Sep 17 00:00:00 2001 From: Damien George Date: Wed, 30 Jul 2014 11:46:05 +0100 Subject: [PATCH] py: Change lexer stream API to return bytes not chars. Lexer is now 8-bit clean inside strings. --- py/lexer.c | 38 ++++++++++++++++++++------------------ py/lexer.h | 12 ++++++------ py/lexerstr.c | 6 +++--- py/lexerunix.c | 14 +++++++------- stmhal/lexerfatfs.c | 12 ++++++------ 5 files changed, 42 insertions(+), 40 deletions(-) diff --git a/py/lexer.c b/py/lexer.c index dab75153ac..ff137fbbb4 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -45,7 +45,7 @@ struct _mp_lexer_t { qstr source_name; // name of source void *stream_data; // data for stream - mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char + mp_lexer_stream_next_byte_t stream_next_byte; // stream callback to get next byte mp_lexer_stream_close_t stream_close; // stream callback to free unichar chr0, chr1, chr2; // current cached characters from source @@ -103,7 +103,7 @@ void mp_token_show(const mp_token_t *tok) { #define CUR_CHAR(lex) ((lex)->chr0) STATIC bool is_end(mp_lexer_t *lex) { - return lex->chr0 == MP_LEXER_CHAR_EOF; + return lex->chr0 == MP_LEXER_EOF; } STATIC bool is_physical_newline(mp_lexer_t *lex) { @@ -171,7 +171,7 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) { } STATIC void next_char(mp_lexer_t *lex) { - if (lex->chr0 == MP_LEXER_CHAR_EOF) { + if (lex->chr0 == MP_LEXER_EOF) { return; } @@ -200,10 +200,10 @@ STATIC void next_char(mp_lexer_t *lex) { for (; advance > 0; advance--) { lex->chr0 = lex->chr1; lex->chr1 = lex->chr2; - lex->chr2 = lex->stream_next_char(lex->stream_data); - if (lex->chr2 == MP_LEXER_CHAR_EOF) { + lex->chr2 = lex->stream_next_byte(lex->stream_data); + if (lex->chr2 == MP_LEXER_EOF) { // EOF - if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') { + if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') { lex->chr2 = '\n'; // insert newline at end of file } } @@ -491,8 +491,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs vstr_add_char(&lex->vstr, '\\'); } else { switch (c) { - case MP_LEXER_CHAR_EOF: break; // TODO a proper error message? - case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it + case MP_LEXER_EOF: break; // TODO a proper error message? + case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it case '\\': break; case '\'': break; case '"': break; @@ -546,7 +546,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs break; } } - if (c != MP_LEXER_CHAR_EOF) { + if (c != MP_LEXER_EOF) { if (c < 0x110000 && !is_bytes) { vstr_add_char(&lex->vstr, c); } else if (c < 0x100 && is_bytes) { @@ -556,7 +556,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs } } } else { - vstr_add_char(&lex->vstr, CUR_CHAR(lex)); + // Add the "character" as a byte so that we remain 8-bit clean. + // This way, strings are parsed correctly whether or not they contain utf-8 chars. + vstr_add_byte(&lex->vstr, CUR_CHAR(lex)); } } next_char(lex); @@ -728,7 +730,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs } } -mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) { +mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) { mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1); // check for memory allocation error @@ -741,7 +743,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_ lex->source_name = src_name; lex->stream_data = stream_data; - lex->stream_next_char = stream_next_char; + lex->stream_next_byte = stream_next_byte; lex->stream_close = stream_close; lex->line = 1; lex->column = 1; @@ -762,18 +764,18 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_ lex->indent_level[0] = 0; // preload characters - lex->chr0 = stream_next_char(stream_data); - lex->chr1 = stream_next_char(stream_data); - lex->chr2 = stream_next_char(stream_data); + lex->chr0 = stream_next_byte(stream_data); + lex->chr1 = stream_next_byte(stream_data); + lex->chr2 = stream_next_byte(stream_data); // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end - if (lex->chr0 == MP_LEXER_CHAR_EOF) { + if (lex->chr0 == MP_LEXER_EOF) { lex->chr0 = '\n'; - } else if (lex->chr1 == MP_LEXER_CHAR_EOF) { + } else if (lex->chr1 == MP_LEXER_EOF) { if (lex->chr0 != '\n' && lex->chr0 != '\r') { lex->chr1 = '\n'; } - } else if (lex->chr2 == MP_LEXER_CHAR_EOF) { + } else if (lex->chr2 == MP_LEXER_EOF) { if (lex->chr1 != '\n' && lex->chr1 != '\r') { lex->chr2 = '\n'; } diff --git a/py/lexer.h b/py/lexer.h index 3f5176be10..d70735f6d7 100644 --- a/py/lexer.h +++ b/py/lexer.h @@ -139,18 +139,18 @@ typedef struct _mp_token_t { mp_uint_t len; // (byte) length of string of token } mp_token_t; -// the next-char function must return the next character in the stream -// it must return MP_LEXER_CHAR_EOF if end of stream -// it can be called again after returning MP_LEXER_CHAR_EOF, and in that case must return MP_LEXER_CHAR_EOF -#define MP_LEXER_CHAR_EOF (-1) -typedef unichar (*mp_lexer_stream_next_char_t)(void*); +// the next-byte function must return the next byte in the stream +// it must return MP_LEXER_EOF if end of stream +// it can be called again after returning MP_LEXER_EOF, and in that case must return MP_LEXER_EOF +#define MP_LEXER_EOF (-1) +typedef mp_uint_t (*mp_lexer_stream_next_byte_t)(void*); typedef void (*mp_lexer_stream_close_t)(void*); typedef struct _mp_lexer_t mp_lexer_t; void mp_token_show(const mp_token_t *tok); -mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close); +mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close); mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len); void mp_lexer_free(mp_lexer_t *lex); diff --git a/py/lexerstr.c b/py/lexerstr.c index 7655e4be6d..a919dc7e14 100644 --- a/py/lexerstr.c +++ b/py/lexerstr.c @@ -36,11 +36,11 @@ typedef struct _mp_lexer_str_buf_t { const char *src_end; // end (exclusive) of source } mp_lexer_str_buf_t; -STATIC unichar str_buf_next_char(mp_lexer_str_buf_t *sb) { +STATIC mp_uint_t str_buf_next_byte(mp_lexer_str_buf_t *sb) { if (sb->src_cur < sb->src_end) { return *sb->src_cur++; } else { - return MP_LEXER_CHAR_EOF; + return MP_LEXER_EOF; } } @@ -57,5 +57,5 @@ mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t sb->src_beg = str; sb->src_cur = str; sb->src_end = str + len; - return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_char_t)str_buf_next_char, (mp_lexer_stream_close_t)str_buf_free); + return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_byte_t)str_buf_next_byte, (mp_lexer_stream_close_t)str_buf_free); } diff --git a/py/lexerunix.c b/py/lexerunix.c index 51bc915b22..9d669f2bfe 100644 --- a/py/lexerunix.c +++ b/py/lexerunix.c @@ -41,20 +41,20 @@ typedef struct _mp_lexer_file_buf_t { int fd; - char buf[20]; - uint len; - uint pos; + byte buf[20]; + mp_uint_t len; + mp_uint_t pos; } mp_lexer_file_buf_t; -STATIC unichar file_buf_next_char(mp_lexer_file_buf_t *fb) { +STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) { if (fb->pos >= fb->len) { if (fb->len == 0) { - return MP_LEXER_CHAR_EOF; + return MP_LEXER_EOF; } else { int n = read(fb->fd, fb->buf, sizeof(fb->buf)); if (n <= 0) { fb->len = 0; - return MP_LEXER_CHAR_EOF; + return MP_LEXER_EOF; } fb->len = n; fb->pos = 0; @@ -78,7 +78,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) { int n = read(fb->fd, fb->buf, sizeof(fb->buf)); fb->len = n; fb->pos = 0; - return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close); + return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close); } #endif // MICROPY_HELPER_LEXER_UNIX diff --git a/stmhal/lexerfatfs.c b/stmhal/lexerfatfs.c index c578b13af6..21e3a2007e 100644 --- a/stmhal/lexerfatfs.c +++ b/stmhal/lexerfatfs.c @@ -36,20 +36,20 @@ typedef struct _mp_lexer_file_buf_t { FIL fp; - char buf[20]; + byte buf[20]; uint16_t len; uint16_t pos; } mp_lexer_file_buf_t; -static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) { +STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) { if (fb->pos >= fb->len) { if (fb->len < sizeof(fb->buf)) { - return MP_LEXER_CHAR_EOF; + return MP_LEXER_EOF; } else { UINT n; f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n); if (n == 0) { - return MP_LEXER_CHAR_EOF; + return MP_LEXER_EOF; } fb->len = n; fb->pos = 0; @@ -58,7 +58,7 @@ static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) { return fb->buf[fb->pos++]; } -static void file_buf_close(mp_lexer_file_buf_t *fb) { +STATIC void file_buf_close(mp_lexer_file_buf_t *fb) { f_close(&fb->fp); m_del_obj(mp_lexer_file_buf_t, fb); } @@ -74,5 +74,5 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) { f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n); fb->len = n; fb->pos = 0; - return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close); + return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close); }