
ISLE: lexer simplifications (#9108)

* ISLE: reduce allocations when lexing integers

Instead of collecting digits into a temporary `Vec<u8>`, use a slice of
the original underlying `buf`, and only allocate a temporary `String` if
the literal contains an `_`.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
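
A freestanding sketch of the idea, assuming a `Cow`-returning helper (the `digits` name is illustrative; in the PR the logic is inline in the lexer's integer path):

    use std::borrow::Cow;

    /// Strip `_` digit separators, allocating only when one is present.
    fn digits(raw: &str) -> Cow<'_, str> {
        if raw.contains('_') {
            Cow::Owned(raw.replace('_', "")) // rare path: allocate a cleaned copy
        } else {
            Cow::Borrowed(raw) // common path: reuse the original slice
        }
    }

    fn main() {
        assert!(matches!(digits("1234"), Cow::Borrowed(_)));
        assert_eq!(digits("1_234"), "1234"); // Cow<str> compares against &str
    }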

* ISLE: don't use the `vec![]` macro in lexer tests

`Vec` can be compared directly against arrays and slices via its
`PartialEq` impls, so the expected values don't need a `vec![]` allocation.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karlwfmeakin@gmail.com>
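
A minimal demonstration of the `PartialEq` impls this relies on:

    fn main() {
        let toks = vec![1, 2, 3];
        assert_eq!(toks, [1, 2, 3]);      // Vec<T>: PartialEq<[U; N]>
        assert_eq!(toks, &[1, 2, 3][..]); // ...and PartialEq<&[U]>
    }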

* ISLE: create `Files`

Centralize all file related arenas in `Files` struct.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
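
A small usage sketch, assuming this PR's version of the crate (the file name and contents are made up; the API calls mirror the diff below):

    use cranelift_isle::files::Files;
    use std::sync::Arc;

    fn main() {
        // Build all per-file arenas once, then share them behind an Arc,
        // as the updated fuzz target does.
        let files = Arc::new(Files::from_names_and_contents([(
            "input.isle".to_string(),
            "(type u32 (primitive u32))\n".to_string(),
        )]));
        assert_eq!(files.file_name(0), Some("input.isle"));
        assert_eq!(files.file_line_map(0).unwrap().line(0), 0);
    }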

* ISLE: don't track line/col in `Pos`

They are already tracked in `Files`, so there is no need to track them in
`Pos` as well. This lets us simplify the implementation of
`Lexer::advance_pos` a bit.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
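
For reference, a self-contained sketch of the on-demand lookup that replaces the eager line/col bookkeeping (it mirrors `LineMap` in `files.rs` below):

    /// Byte offsets just past each '\n', as in the PR's `LineMap`.
    struct LineMap {
        line_ends: Vec<usize>,
    }

    impl LineMap {
        fn from_str(text: &str) -> Self {
            Self {
                line_ends: text.match_indices('\n').map(|(i, _)| i + 1).collect(),
            }
        }

        /// 0-based line containing byte offset `pos`: a binary search,
        /// so `Pos` itself only needs `file` and `offset`.
        fn line(&self, pos: usize) -> usize {
            self.line_ends.partition_point(|&end| end <= pos)
        }
    }

    fn main() {
        let map = LineMap::from_str("line 0\nline 1\nline 2\n");
        assert_eq!(map.line(0), 0);  // first byte of the file
        assert_eq!(map.line(7), 1);  // first byte after the first '\n'
        assert_eq!(map.line(20), 2); // last byte of the file
    }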

* ISLE: don't pass `Files` into every pass

`Files` was being threaded through a lot of passes where it wasn't
needed. It is only needed for reporting errors in `compile.rs` and for
reporting line numbers when printing in `codegen.rs`.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>

* ISLE: store `&str` in `Lexer`

Store the text being lexed as `&str`, rather than `&[u8]`, so that
substrings don't need to be rechecked for UTF-8 validity when lexing
identifiers or integers.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
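
A minimal illustration of the cost this removes (the source string is made up):

    fn main() {
        let src: &str = "(decl lower (Inst) InstOutput)";

        // With &[u8] storage, every substring needed an explicit UTF-8 re-check:
        let bytes: &[u8] = src.as_bytes();
        let sym_old = std::str::from_utf8(&bytes[1..5]).expect("should be UTF-8");

        // With &str storage, slicing already yields &str; no re-validation pass:
        let sym_new = &src[1..5];

        assert_eq!(sym_old, "decl");
        assert_eq!(sym_new, "decl");
    }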

* ISLE: add `peek_byte` helper for lexer

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
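
For reference, a freestanding equivalent of the helper (the real one is a method on `Lexer`, shown in the diff below): a bounds-checked, non-consuming byte lookup.

    fn peek_byte(src: &str, offset: usize) -> Option<u8> {
        src.as_bytes().get(offset).copied()
    }

    fn main() {
        assert_eq!(peek_byte("(a)", 0), Some(b'('));
        assert_eq!(peek_byte("(a)", 3), None); // past the end: no panic
    }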

* ISLE: tests for lexing integers

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>

* ISLE: don't parse integers twice

Instead of trying to parse an integer as an `i128`, and then as a
`u128` if that fails, parse it only as a `u128` and then check for
`i128::MIN`.

Copyright (c) 2024, Arm Limited.

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
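
A standalone sketch of the single-parse scheme (the `parse_int` signature is illustrative; in the PR this logic is inline in the lexer):

    fn parse_int(digits: &str, neg: bool, radix: u32) -> Result<i128, String> {
        // One parse covers both the signed range (-2^127..2^127) and the
        // unsigned range (0..2^128); the bits are stored in an i128 token.
        let num = u128::from_str_radix(digits, radix).map_err(|e| e.to_string())?;
        match (neg, num) {
            // Magnitude 2^127: `num as i128` wraps to i128::MIN, whose
            // negation would overflow, so reject it (as the old two-parse
            // code also did).
            (true, 0x8000_0000_0000_0000_0000_0000_0000_0000) => {
                Err("integer literal cannot fit in i128".to_string())
            }
            (true, _) => Ok(-(num as i128)),
            (false, _) => Ok(num as i128),
        }
    }

    fn main() {
        assert_eq!(parse_int("23", false, 10), Ok(23));
        assert_eq!(parse_int("ff", true, 16), Ok(-255));
        // The all-ones u128 reads back as -1, matching the new lexer test.
        assert_eq!(parse_int(&"f".repeat(32), false, 16), Ok(-1));
    }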

---------

Signed-off-by: Karl Meakin <karl.meakin@arm.com>
Signed-off-by: Karl Meakin <karlwfmeakin@gmail.com>
Karl Meakin, committed via GitHub
commit e0a907a94a
11 changed files (lines changed):

   14  cranelift/isle/fuzz/fuzz_targets/compile.rs
    9  cranelift/isle/isle/src/ast.rs
   16  cranelift/isle/isle/src/codegen.rs
   59  cranelift/isle/isle/src/compile.rs
   35  cranelift/isle/isle/src/error.rs
  133  cranelift/isle/isle/src/files.rs
  292  cranelift/isle/isle/src/lexer.rs
    1  cranelift/isle/isle/src/lib.rs
   15  cranelift/isle/isle/src/overlap.rs
   26  cranelift/isle/isle/src/parser.rs
   71  cranelift/isle/isle/src/sema.rs

14
cranelift/isle/fuzz/fuzz_targets/compile.rs

@@ -1,11 +1,14 @@
#![no_main]
use std::sync::Arc;
use cranelift_isle::files::Files;
use libfuzzer_sys::fuzz_target;
fuzz_target!(|s: &str| {
fuzz_target!(|src: &str| {
let _ = env_logger::try_init();
let lexer = cranelift_isle::lexer::Lexer::from_str(s, "fuzz-input.isle");
let lexer = cranelift_isle::lexer::Lexer::new(0, src);
log::debug!("lexer = {:?}", lexer);
let lexer = match lexer {
Ok(l) => l,
@@ -19,7 +22,12 @@ fuzz_target!(|s: &str| {
Err(_) => return,
};
let code = cranelift_isle::compile::compile(&defs, &Default::default());
let files = Arc::new(Files::from_names_and_contents([(
"fuzz-input.isle".to_string(),
src.to_string(),
)]));
let code = cranelift_isle::compile::compile(files, &defs, &Default::default());
log::debug!("code = {:?}", code);
let code = match code {
Ok(c) => c,

9
cranelift/isle/isle/src/ast.rs

@@ -4,15 +4,6 @@
use crate::lexer::Pos;
use crate::log;
use std::sync::Arc;
/// The parsed form of an ISLE file.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct Defs {
pub defs: Vec<Def>,
pub filenames: Vec<Arc<str>>,
pub file_texts: Vec<Arc<str>>,
}
/// One toplevel form in an ISLE file.
#[derive(Clone, PartialEq, Eq, Debug)]

16
cranelift/isle/isle/src/codegen.rs

@@ -1,11 +1,13 @@
//! Generate Rust code from a series of Sequences.
use crate::files::Files;
use crate::sema::{ExternalSig, ReturnKind, Sym, Term, TermEnv, TermId, Type, TypeEnv, TypeId};
use crate::serialize::{Block, ControlFlow, EvalStep, MatchArm};
use crate::stablemapset::StableSet;
use crate::trie_again::{Binding, BindingId, Constraint, RuleSet};
use std::fmt::Write;
use std::slice::Iter;
use std::sync::Arc;
/// Options for code generation.
#[derive(Clone, Debug, Default)]
@@ -17,16 +19,18 @@ pub struct CodegenOptions {
/// Emit Rust source code for the given type and term environments.
pub fn codegen(
files: Arc<Files>,
typeenv: &TypeEnv,
termenv: &TermEnv,
terms: &[(TermId, RuleSet)],
options: &CodegenOptions,
) -> String {
Codegen::compile(typeenv, termenv, terms).generate_rust(options)
Codegen::compile(files, typeenv, termenv, terms).generate_rust(options)
}
#[derive(Clone, Debug)]
struct Codegen<'a> {
files: Arc<Files>,
typeenv: &'a TypeEnv,
termenv: &'a TermEnv,
terms: &'a [(TermId, RuleSet)],
@@ -91,11 +95,13 @@ impl<'a, W: Write> BodyContext<'a, W> {
impl<'a> Codegen<'a> {
fn compile(
files: Arc<Files>,
typeenv: &'a TypeEnv,
termenv: &'a TermEnv,
terms: &'a [(TermId, RuleSet)],
) -> Codegen<'a> {
Codegen {
files,
typeenv,
termenv,
terms,
@@ -121,7 +127,7 @@ impl<'a> Codegen<'a> {
"// Generated automatically from the instruction-selection DSL code in:",
)
.unwrap();
for file in &self.typeenv.filenames {
for file in &self.files.file_names {
writeln!(code, "// - {file}").unwrap();
}
@@ -335,7 +341,7 @@ impl<L: Length, C> Length for ContextIterWrapper<L, C> {{
code,
"\n/// Internal type {}: defined at {}.",
name,
pos.pretty_print_line(&self.typeenv.filenames[..])
pos.pretty_print_line(&self.files)
)
.unwrap();
@@ -454,7 +460,7 @@ impl<L: Length, C> Length for ContextIterWrapper<L, C> {{
term_name,
termdata
.decl_pos
.pretty_print_line(&self.typeenv.filenames[..])
.pretty_print_line(&self.files)
),
}
};
@@ -640,7 +646,7 @@ impl<L: Length, C> Length for ContextIterWrapper<L, C> {{
ctx.out,
"{}// Rule at {}.",
&ctx.indent,
pos.pretty_print_line(&self.typeenv.filenames)
pos.pretty_print_line(&self.files)
)?;
write!(ctx.out, "{}", &ctx.indent)?;
match ret_kind {

59
cranelift/isle/isle/src/compile.rs

@@ -1,16 +1,34 @@
//! Compilation process, from AST to Sema to Sequences of Insts.
use std::path::Path;
use std::sync::Arc;
use crate::error::Errors;
use crate::{ast, codegen, sema};
use crate::files::Files;
use crate::{ast, codegen, overlap, sema};
/// Compile the given AST definitions into Rust source code.
pub fn compile(defs: &ast::Defs, options: &codegen::CodegenOptions) -> Result<String, Errors> {
let mut typeenv = sema::TypeEnv::from_ast(defs)?;
let termenv = sema::TermEnv::from_ast(&mut typeenv, defs)?;
let terms = crate::overlap::check(&typeenv, &termenv)?;
Ok(codegen::codegen(&typeenv, &termenv, &terms, options))
pub fn compile(
files: Arc<Files>,
defs: &[ast::Def],
options: &codegen::CodegenOptions,
) -> Result<String, Errors> {
let mut type_env = match sema::TypeEnv::from_ast(defs) {
Ok(type_env) => type_env,
Err(errs) => return Err(Errors::new(errs, files)),
};
let term_env = match sema::TermEnv::from_ast(&mut type_env, defs) {
Ok(term_env) => term_env,
Err(errs) => return Err(Errors::new(errs, files)),
};
let terms = match overlap::check(&term_env) {
Ok(terms) => terms,
Err(errs) => return Err(Errors::new(errs, files)),
};
Ok(codegen::codegen(
files, &type_env, &term_env, &terms, options,
))
}
/// Compile the given files into Rust source code.
@@ -18,7 +36,30 @@ pub fn from_files<P: AsRef<Path>>(
inputs: impl IntoIterator<Item = P>,
options: &codegen::CodegenOptions,
) -> Result<String, Errors> {
let lexer = crate::lexer::Lexer::from_files(inputs)?;
let defs = crate::parser::parse(lexer)?;
compile(&defs, options)
let files = match Files::from_paths(inputs) {
Ok(files) => files,
Err((path, err)) => {
return Err(Errors::from_io(
err,
format!("cannot read file {}", path.display()),
))
}
};
let files = Arc::new(files);
let mut defs = Vec::new();
for (file, src) in files.file_texts.iter().enumerate() {
let lexer = match crate::lexer::Lexer::new(file, src) {
Ok(lexer) => lexer,
Err(err) => return Err(Errors::new(vec![err], files)),
};
match crate::parser::parse(lexer) {
Ok(mut ds) => defs.append(&mut ds),
Err(err) => return Err(Errors::new(vec![err], files)),
}
}
compile(files, &defs, options)
}

35
cranelift/isle/isle/src/error.rs

@@ -2,14 +2,13 @@
use std::sync::Arc;
use crate::lexer::Pos;
use crate::{files::Files, lexer::Pos};
/// A collection of errors from attempting to compile some ISLE source files.
pub struct Errors {
/// The individual errors.
pub errors: Vec<Error>,
pub(crate) filenames: Vec<Arc<str>>,
pub(crate) file_texts: Vec<Arc<str>>,
pub(crate) files: Arc<Files>,
}
impl std::fmt::Debug for Errors {
@@ -139,6 +138,11 @@ pub enum Error {
}
impl Errors {
/// Create new Errors
pub fn new(errors: Vec<Error>, files: Arc<Files>) -> Self {
Self { errors, files }
}
/// Create `isle::Errors` from the given I/O error and context.
pub fn from_io(error: std::io::Error, context: impl Into<String>) -> Self {
Errors {
@@ -146,8 +150,7 @@ impl Errors {
error,
context: context.into(),
}],
filenames: Vec::new(),
file_texts: Vec::new(),
files: Arc::new(Files::default()),
}
}
@@ -161,7 +164,12 @@ impl Errors {
let w = termcolor::BufferWriter::stderr(termcolor::ColorChoice::Auto);
let mut b = w.buffer();
let mut files = codespan_reporting::files::SimpleFiles::new();
for (name, source) in self.filenames.iter().zip(self.file_texts.iter()) {
for (name, source) in self
.files
.file_names
.iter()
.zip(self.files.file_texts.iter())
{
files.add(name, source);
}
for diagnostic in diagnostics {
@@ -179,21 +187,16 @@ impl Errors {
f: &mut std::fmt::Formatter,
diagnostics: Vec<Diagnostic<usize>>,
) -> std::fmt::Result {
let line_ends: Vec<Vec<_>> = self
.file_texts
.iter()
.map(|text| text.match_indices('\n').map(|(i, _)| i + 1).collect())
.collect();
let pos = |file_id: usize, offset| {
let ends = &line_ends[file_id];
let line0 = ends.partition_point(|&end| end <= offset);
let text = &self.file_texts[file_id];
let ends = self.files.file_line_map(file_id).unwrap();
let line0 = ends.line(offset);
let text = &self.files.file_texts[file_id];
let start = line0.checked_sub(1).map_or(0, |prev| ends[prev]);
let end = ends.get(line0).copied().unwrap_or(text.len());
let col = offset - start + 1;
format!(
"{}:{}:{}: {}",
self.filenames[file_id],
self.files.file_names[file_id],
line0 + 1,
col,
&text[start..end]
@@ -243,8 +246,6 @@ impl Span {
to: Pos {
file: pos.file,
offset: pos.offset + 1,
line: pos.line,
col: pos.col + 1,
},
}
}

133
cranelift/isle/isle/src/files.rs

@@ -0,0 +1,133 @@
#![allow(missing_docs)]
use std::ops::Index;
use std::path::{Path, PathBuf};
#[derive(Default, Clone, PartialEq, Eq, Debug)]
pub struct Files {
/// Arena of filenames from the input source.
///
/// Indexed via `Pos::file`.
pub file_names: Vec<String>,
/// Arena of file source texts.
///
/// Indexed via `Pos::file`.
pub file_texts: Vec<String>,
/// Arena of file line maps.
///
/// Indexed via `Pos::file`.
pub file_line_maps: Vec<LineMap>,
}
#[derive(Default, Clone, PartialEq, Eq, Debug)]
pub struct LineMap {
/// Mapping from line number to starting byte position.
line_ends: Vec<usize>,
}
impl Index<usize> for LineMap {
type Output = usize;
fn index(&self, index: usize) -> &Self::Output {
&self.line_ends[index]
}
}
impl LineMap {
pub fn from_str(text: &str) -> Self {
let line_ends = text.match_indices('\n').map(|(i, _)| i + 1).collect();
Self { line_ends }
}
/// Get the line on which `pos` occurs
pub fn line(&self, pos: usize) -> usize {
self.line_ends.partition_point(|&end| end <= pos)
}
/// Get the starting byte position of `line`.
pub fn get(&self, line: usize) -> Option<&usize> {
self.line_ends.get(line)
}
}
impl Files {
pub fn from_paths<P: AsRef<Path>>(
paths: impl IntoIterator<Item = P>,
) -> Result<Self, (PathBuf, std::io::Error)> {
let mut file_names = Vec::new();
let mut file_texts = Vec::new();
let mut file_line_maps = Vec::new();
for path in paths {
let path = path.as_ref();
let contents =
std::fs::read_to_string(path).map_err(|err| (path.to_path_buf(), err))?;
let name = path.display().to_string();
file_line_maps.push(LineMap::from_str(&contents));
file_names.push(name);
file_texts.push(contents);
}
Ok(Self {
file_names,
file_texts,
file_line_maps,
})
}
pub fn from_names_and_contents(files: impl IntoIterator<Item = (String, String)>) -> Self {
let mut file_names = Vec::new();
let mut file_texts = Vec::new();
let mut file_line_maps = Vec::new();
for (name, contents) in files {
file_line_maps.push(LineMap::from_str(&contents));
file_names.push(name);
file_texts.push(contents);
}
Self {
file_names,
file_texts,
file_line_maps,
}
}
pub fn file_name(&self, file: usize) -> Option<&str> {
self.file_names.get(file).map(|x| x.as_str())
}
pub fn file_text(&self, file: usize) -> Option<&str> {
self.file_texts.get(file).map(|x| x.as_str())
}
pub fn file_line_map(&self, file: usize) -> Option<&LineMap> {
self.file_line_maps.get(file)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn line_map() {
let line_map = LineMap::from_str("");
assert_eq!(line_map.line_ends, &[]);
assert_eq!(line_map.line(0), 0);
assert_eq!(line_map.line(100), 0);
let line_map = LineMap::from_str("line 0");
assert_eq!(line_map.line_ends, &[]);
assert_eq!(line_map.line(0), 0);
assert_eq!(line_map.line(100), 0);
let line_map = LineMap::from_str("line 0\nline 1");
assert_eq!(line_map.line_ends, &[7]);
assert_eq!(line_map.line(0), 0);
assert_eq!(line_map.line(100), 1);
}
}

292
cranelift/isle/isle/src/lexer.rs

@@ -1,29 +1,18 @@
//! Lexer for the ISLE language.
use crate::error::{Error, Errors, Span};
use std::borrow::Cow;
use std::path::{Path, PathBuf};
use std::sync::Arc;
type Result<T> = std::result::Result<T, Errors>;
use crate::error::{Error, Span};
use crate::files::Files;
type Result<T> = std::result::Result<T, Error>;
/// The lexer.
///
/// Breaks source text up into a sequence of tokens (with source positions).
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
/// Arena of filenames from the input source.
///
/// Indexed via `Pos::file`.
pub filenames: Vec<Arc<str>>,
/// Arena of file source texts.
///
/// Indexed via `Pos::file`.
pub file_texts: Vec<Arc<str>>,
file_starts: Vec<usize>,
buf: Cow<'a, [u8]>,
pub struct Lexer<'src> {
src: &'src str,
pos: Pos,
lookahead: Option<(Pos, Token)>,
}
@@ -38,16 +27,21 @@ pub struct Pos {
pub file: usize,
/// This source position's byte offset in the file.
pub offset: usize,
/// This source position's line number in the file.
pub line: usize,
/// This source position's column number in the file.
pub col: usize,
}
impl Pos {
/// Create a new `Pos`.
pub fn new(file: usize, offset: usize) -> Self {
Self { file, offset }
}
/// Print this source position as `file.isle line 12`.
pub fn pretty_print_line(&self, filenames: &[Arc<str>]) -> String {
format!("{} line {}", filenames[self.file], self.line)
pub fn pretty_print_line(&self, files: &Files) -> String {
format!(
"{} line {}",
files.file_name(self.file).unwrap(),
files.file_line_map(self.file).unwrap().line(self.offset)
)
}
}
@@ -66,69 +60,12 @@ pub enum Token {
At,
}
impl<'a> Lexer<'a> {
/// Create a new lexer for the given source contents and filename.
pub fn from_str(s: &'a str, filename: &'a str) -> Result<Lexer<'a>> {
let mut l = Lexer {
filenames: vec![filename.into()],
file_texts: vec![s.into()],
file_starts: vec![0],
buf: Cow::Borrowed(s.as_bytes()),
pos: Pos {
file: 0,
offset: 0,
line: 1,
col: 0,
},
lookahead: None,
};
l.reload()?;
Ok(l)
}
/// Create a new lexer from the given files.
pub fn from_files<P>(file_paths: impl IntoIterator<Item = P>) -> Result<Lexer<'a>>
where
P: AsRef<Path>,
{
let mut files = vec![];
for f in file_paths.into_iter() {
let f = f.as_ref().to_path_buf();
let s = std::fs::read_to_string(f.as_path())
.map_err(|e| Errors::from_io(e, format!("failed to read file: {}", f.display())))?;
files.push((f, s));
}
Self::from_file_contents(files)
}
/// Create a new lexer from the given files and contents.
pub fn from_file_contents(files: Vec<(PathBuf, String)>) -> Result<Lexer<'a>> {
let mut filenames = Vec::<Arc<str>>::new();
let mut file_texts = Vec::<Arc<str>>::new();
for (f, content) in files.iter() {
filenames.push(f.display().to_string().into());
file_texts.push(content.as_str().into());
}
assert!(!filenames.is_empty());
let mut file_starts = vec![];
let mut buf = String::new();
for text in &file_texts {
file_starts.push(buf.len());
buf += text;
buf += "\n";
}
impl<'src> Lexer<'src> {
/// Create a new lexer for the given source contents
pub fn new(file: usize, src: &'src str) -> Result<Lexer<'src>> {
let mut l = Lexer {
filenames,
file_texts,
buf: Cow::Owned(buf.into_bytes()),
file_starts,
pos: Pos {
file: 0,
offset: 0,
line: 1,
col: 0,
},
src,
pos: Pos::new(file, 0),
lookahead: None,
};
l.reload()?;
@@ -137,39 +74,17 @@ impl<'a> Lexer<'a> {
/// Get the lexer's current source position.
pub fn pos(&self) -> Pos {
Pos {
file: self.pos.file,
offset: self.pos.offset - self.file_starts[self.pos.file],
line: self.pos.line,
col: self.pos.col,
}
self.pos
}
fn advance_pos(&mut self) {
self.pos.col += 1;
if self.buf[self.pos.offset] == b'\n' {
self.pos.line += 1;
self.pos.col = 0;
}
self.pos.offset += 1;
if self.pos.file + 1 < self.file_starts.len() {
let next_start = self.file_starts[self.pos.file + 1];
if self.pos.offset >= next_start {
assert!(self.pos.offset == next_start);
self.pos.file += 1;
self.pos.line = 1;
}
}
}
fn error(&self, pos: Pos, msg: impl Into<String>) -> Errors {
Errors {
errors: vec![Error::ParseError {
msg: msg.into(),
span: Span::new_single(pos),
}],
filenames: self.filenames.clone(),
file_texts: self.file_texts.clone(),
fn error(&self, pos: Pos, msg: impl Into<String>) -> Error {
Error::ParseError {
msg: msg.into(),
span: Span::new_single(pos),
}
}
@@ -190,26 +105,26 @@ impl<'a> Lexer<'a> {
}
// Skip any whitespace and any comments.
while self.pos.offset < self.buf.len() {
if self.buf[self.pos.offset].is_ascii_whitespace() {
self.advance_pos();
continue;
}
if self.buf[self.pos.offset] == b';' {
while self.pos.offset < self.buf.len() && self.buf[self.pos.offset] != b'\n' {
self.advance_pos();
while let Some(c) = self.peek_byte() {
match c {
c if c.is_ascii_whitespace() => self.advance_pos(),
b';' => {
while let Some(c) = self.peek_byte() {
match c {
b'\n' => break,
_ => self.advance_pos(),
}
}
}
continue;
_ => break,
}
break;
}
if self.pos.offset == self.buf.len() {
let Some(c) = self.peek_byte() else {
return Ok(None);
}
};
let char_pos = self.pos();
match self.buf[self.pos.offset] {
match c {
b'(' => {
self.advance_pos();
Ok(Some((char_pos, Token::LParen)))
@@ -225,44 +140,43 @@ impl<'a> Lexer<'a> {
c if is_sym_first_char(c) => {
let start = self.pos.offset;
let start_pos = self.pos();
while self.pos.offset < self.buf.len()
&& is_sym_other_char(self.buf[self.pos.offset])
{
self.advance_pos();
while let Some(c) = self.peek_byte() {
match c {
c if is_sym_other_char(c) => self.advance_pos(),
_ => break,
}
}
let end = self.pos.offset;
let s = std::str::from_utf8(&self.buf[start..end])
.expect("Only ASCII characters, should be UTF-8");
let s = &self.src[start..end];
debug_assert!(!s.is_empty());
Ok(Some((start_pos, Token::Symbol(s.to_string()))))
}
c @ (b'0'..=b'9' | b'-') => {
let start_pos = self.pos();
let neg = if c == b'-' {
let mut neg = false;
if c == b'-' {
self.advance_pos();
true
} else {
false
};
neg = true;
}
let mut radix = 10;
// Check for prefixed literals.
match (
self.buf.get(self.pos.offset),
self.buf.get(self.pos.offset + 1),
self.src.as_bytes().get(self.pos.offset),
self.src.as_bytes().get(self.pos.offset + 1),
) {
(Some(b'0'), Some(b'x')) | (Some(b'0'), Some(b'X')) => {
(Some(b'0'), Some(b'x' | b'X')) => {
self.advance_pos();
self.advance_pos();
radix = 16;
}
(Some(b'0'), Some(b'o')) => {
(Some(b'0'), Some(b'o' | b'O')) => {
self.advance_pos();
self.advance_pos();
radix = 8;
}
(Some(b'0'), Some(b'b')) => {
(Some(b'0'), Some(b'b' | b'B')) => {
self.advance_pos();
self.advance_pos();
radix = 2;
@@ -273,32 +187,37 @@ impl<'a> Lexer<'a> {
// Find the range in the buffer for this integer literal. We'll
// pass this range to `i64::from_str_radix` to do the actual
// string-to-integer conversion.
let mut s = vec![];
while self.pos.offset < self.buf.len()
&& ((radix <= 10 && self.buf[self.pos.offset].is_ascii_digit())
|| (radix == 16 && self.buf[self.pos.offset].is_ascii_hexdigit())
|| self.buf[self.pos.offset] == b'_')
{
if self.buf[self.pos.offset] != b'_' {
s.push(self.buf[self.pos.offset]);
let start = self.pos.offset;
while let Some(c) = self.peek_byte() {
match c {
b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_' => self.advance_pos(),
_ => break,
}
self.advance_pos();
}
let s_utf8 = std::str::from_utf8(&s[..]).unwrap();
let end = self.pos.offset;
let s = &self.src[start..end];
let s = if s.contains('_') {
Cow::Owned(s.replace('_', ""))
} else {
Cow::Borrowed(s)
};
// Support either signed range (-2^127..2^127) or
// unsigned range (0..2^128).
let num = i128::from_str_radix(s_utf8, radix)
.or_else(|_| u128::from_str_radix(s_utf8, radix).map(|val| val as i128))
.map_err(|e| self.error(start_pos, e.to_string()))?;
let tok = if neg {
Token::Int(num.checked_neg().ok_or_else(|| {
self.error(start_pos, "integer literal cannot fit in i128")
})?)
} else {
Token::Int(num)
let num = match u128::from_str_radix(&s, radix) {
Ok(num) => num,
Err(err) => return Err(self.error(start_pos, err.to_string())),
};
let num = match (neg, num) {
(true, 0x80000000000000000000000000000000) => {
return Err(self.error(start_pos, "integer literal cannot fit in i128"))
}
(true, _) => -(num as i128),
(false, _) => num as i128,
};
let tok = Token::Int(num);
Ok(Some((start_pos, tok)))
}
c => Err(self.error(self.pos, format!("Unexpected character '{c}'"))),
@@ -313,7 +232,7 @@ impl<'a> Lexer<'a> {
}
fn reload(&mut self) -> Result<()> {
if self.lookahead.is_none() && self.pos.offset < self.buf.len() {
if self.lookahead.is_none() && self.pos.offset < self.src.len() {
self.lookahead = self.next_token()?;
}
Ok(())
@@ -328,6 +247,10 @@ impl<'a> Lexer<'a> {
pub fn eof(&self) -> bool {
self.lookahead.is_none()
}
fn peek_byte(&self) -> Option<u8> {
self.src.as_bytes().get(self.pos.offset).copied()
}
}
impl Token {
@@ -346,9 +269,10 @@ impl Token {
mod test {
use super::*;
fn lex(s: &str, file: &str) -> Vec<Token> {
#[track_caller]
fn lex(src: &str) -> Vec<Token> {
let mut toks = vec![];
let mut lexer = Lexer::from_str(s, file).unwrap();
let mut lexer = Lexer::new(0, src).unwrap();
while let Some((_, tok)) = lexer.next().unwrap() {
toks.push(tok);
}
@@ -358,11 +282,8 @@ mod test {
#[test]
fn lexer_basic() {
assert_eq!(
lex(
";; comment\n; another\r\n \t(one two three 23 -568 )\n",
"lexer_basic"
),
vec![
lex(";; comment\n; another\r\n \t(one two three 23 -568 )\n"),
[
Token::LParen,
Token::Symbol("one".to_string()),
Token::Symbol("two".to_string()),
@@ -376,22 +297,19 @@ mod test {
#[test]
fn ends_with_sym() {
assert_eq!(
lex("asdf", "ends_with_sym"),
vec![Token::Symbol("asdf".to_string()),]
);
assert_eq!(lex("asdf"), [Token::Symbol("asdf".to_string())]);
}
#[test]
fn ends_with_num() {
assert_eq!(lex("23", "ends_with_num"), vec![Token::Int(23)],);
assert_eq!(lex("23"), [Token::Int(23)]);
}
#[test]
fn weird_syms() {
assert_eq!(
lex("(+ [] => !! _test!;comment\n)", "weird_syms"),
vec![
lex("(+ [] => !! _test!;comment\n)"),
[
Token::LParen,
Token::Symbol("+".to_string()),
Token::Symbol("[]".to_string()),
@@ -402,4 +320,24 @@
]
);
}
#[test]
fn integers() {
assert_eq!(
lex("0 1 -1"),
[Token::Int(0), Token::Int(1), Token::Int(-1)]
);
assert_eq!(
lex("340_282_366_920_938_463_463_374_607_431_768_211_455"),
[Token::Int(-1)]
);
assert_eq!(
lex("170_141_183_460_469_231_731_687_303_715_884_105_727"),
[Token::Int(i128::MAX)]
);
assert!(Lexer::new(0, "-170_141_183_460_469_231_731_687_303_715_884_105_728").is_err())
}
}

1
cranelift/isle/isle/src/lib.rs

@@ -23,6 +23,7 @@ pub mod codegen;
pub mod compile;
pub mod disjointsets;
pub mod error;
pub mod files;
pub mod lexer;
mod log;
pub mod overlap;

15
cranelift/isle/isle/src/overlap.rs

@@ -3,27 +3,20 @@
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use crate::error::{self, Error, Span};
use crate::error::{Error, Span};
use crate::lexer::Pos;
use crate::sema::{TermEnv, TermId, TermKind, TypeEnv};
use crate::sema::{TermEnv, TermId, TermKind};
use crate::trie_again;
/// Check for overlap.
pub fn check(
tyenv: &TypeEnv,
termenv: &TermEnv,
) -> Result<Vec<(TermId, trie_again::RuleSet)>, error::Errors> {
pub fn check(termenv: &TermEnv) -> Result<Vec<(TermId, trie_again::RuleSet)>, Vec<Error>> {
let (terms, mut errors) = trie_again::build(termenv);
errors.append(&mut check_overlaps(&terms, termenv).report());
if errors.is_empty() {
Ok(terms)
} else {
Err(error::Errors {
errors,
filenames: tyenv.filenames.clone(),
file_texts: tyenv.file_texts.clone(),
})
Err(errors)
}
}

26
cranelift/isle/isle/src/parser.rs

@@ -1,13 +1,13 @@
//! Parser for ISLE language.
use crate::ast::*;
use crate::error::{Error, Errors, Span};
use crate::error::{Error, Span};
use crate::lexer::{Lexer, Pos, Token};
type Result<T> = std::result::Result<T, Errors>;
type Result<T> = std::result::Result<T, Error>;
/// Parse the top-level ISLE definitions and return their AST.
pub fn parse(lexer: Lexer) -> Result<Defs> {
pub fn parse(lexer: Lexer) -> Result<Vec<Def>> {
let parser = Parser::new(lexer);
parser.parse_defs()
}
@@ -34,14 +34,10 @@ impl<'a> Parser<'a> {
Parser { lexer }
}
fn error(&self, pos: Pos, msg: String) -> Errors {
Errors {
errors: vec![Error::ParseError {
msg,
span: Span::new_single(pos),
}],
filenames: self.lexer.filenames.clone(),
file_texts: self.lexer.file_texts.clone(),
fn error(&self, pos: Pos, msg: String) -> Error {
Error::ParseError {
msg,
span: Span::new_single(pos),
}
}
@@ -136,16 +132,12 @@
}
}
fn parse_defs(mut self) -> Result<Defs> {
fn parse_defs(mut self) -> Result<Vec<Def>> {
let mut defs = vec![];
while !self.lexer.eof() {
defs.push(self.parse_def()?);
}
Ok(Defs {
defs,
filenames: self.lexer.filenames,
file_texts: self.lexer.file_texts,
})
Ok(defs)
}
fn parse_def(&mut self) -> Result<Def> {

71
cranelift/isle/isle/src/sema.rs

@@ -22,7 +22,6 @@ use std::collections::hash_map::Entry;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::sync::Arc;
declare_id!(
/// The id of an interned symbol.
@@ -58,16 +57,6 @@
/// Keeps track of which symbols and rules have which types.
#[derive(Debug)]
pub struct TypeEnv {
/// Arena of input ISLE source filenames.
///
/// We refer to these indirectly through the `Pos::file` indices.
pub filenames: Vec<Arc<str>>,
/// Arena of input ISLE source contents.
///
/// We refer to these indirectly through the `Pos::file` indices.
pub file_texts: Vec<Arc<str>>,
/// Arena of interned symbol names.
///
/// Referred to indirectly via `Sym` indices.
@@ -912,10 +901,8 @@ macro_rules! unwrap_or_continue {
impl TypeEnv {
/// Construct the type environment from the AST.
pub fn from_ast(defs: &ast::Defs) -> Result<TypeEnv, Errors> {
pub fn from_ast(defs: &[ast::Def]) -> Result<TypeEnv, Vec<Error>> {
let mut tyenv = TypeEnv {
filenames: defs.filenames.clone(),
file_texts: defs.file_texts.clone(),
syms: vec![],
sym_map: StableMap::new(),
types: vec![],
@@ -926,7 +913,7 @@
// Traverse defs, assigning type IDs to type names. We'll fill
// in types on a second pass.
for def in &defs.defs {
for def in defs {
match def {
&ast::Def::Type(ref td) => {
let tid = TypeId(tyenv.type_map.len());
@@ -954,7 +941,7 @@
// Now lower AST nodes to type definitions, raising errors
// where typenames of fields are undefined or field names are
// duplicated.
for def in &defs.defs {
for def in defs {
match def {
&ast::Def::Type(ref td) => {
let tid = tyenv.types.len();
@@ -967,7 +954,7 @@
}
// Now collect types for extern constants.
for def in &defs.defs {
for def in defs {
if let &ast::Def::Extern(ast::Extern::Const {
ref name,
ref ty,
@@ -991,15 +978,11 @@ impl TypeEnv {
Ok(tyenv)
}
fn return_errors(&mut self) -> Result<(), Errors> {
fn return_errors(&mut self) -> Result<(), Vec<Error>> {
if self.errors.is_empty() {
Ok(())
} else {
Err(Errors {
errors: std::mem::take(&mut self.errors),
filenames: self.filenames.clone(),
file_texts: self.file_texts.clone(),
})
Err(std::mem::take(&mut self.errors))
}
}
@@ -1169,7 +1152,7 @@ impl Bindings {
impl TermEnv {
/// Construct the term environment from the AST and the type environment.
pub fn from_ast(tyenv: &mut TypeEnv, defs: &ast::Defs) -> Result<TermEnv, Errors> {
pub fn from_ast(tyenv: &mut TypeEnv, defs: &[ast::Def]) -> Result<TermEnv, Vec<Error>> {
let mut env = TermEnv {
terms: vec![],
term_map: StableMap::new(),
@@ -1196,13 +1179,13 @@ impl TermEnv {
Ok(env)
}
fn collect_pragmas(&mut self, _: &ast::Defs) {
fn collect_pragmas(&mut self, _: &[ast::Def]) {
// currently, no pragmas are defined, but the infrastructure is useful to keep around
return;
}
fn collect_term_sigs(&mut self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn collect_term_sigs(&mut self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
match def {
&ast::Def::Decl(ref decl) => {
let name = tyenv.intern_mut(&decl.term);
@@ -1315,8 +1298,8 @@
}
}
fn collect_constructors(&mut self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn collect_constructors(&mut self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
log!("collect_constructors from def: {:?}", def);
match def {
&ast::Def::Rule(ref rule) => {
@@ -1378,10 +1361,10 @@
}
}
fn collect_extractor_templates(&mut self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
fn collect_extractor_templates(&mut self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
let mut extractor_call_graph = BTreeMap::new();
for def in &defs.defs {
for def in defs {
if let &ast::Def::Extractor(ref ext) = def {
let term = match self.get_term_by_name(tyenv, &ext.term) {
Some(x) => x,
@@ -1502,8 +1485,8 @@
}
}
fn collect_converters(&mut self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn collect_converters(&mut self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
match def {
&ast::Def::Converter(ast::Converter {
ref term,
@@ -1565,8 +1548,8 @@
}
}
fn collect_externs(&mut self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn collect_externs(&mut self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
match def {
&ast::Def::Extern(ast::Extern::Constructor {
ref term,
@@ -1688,8 +1671,8 @@
}
}
fn collect_rules(&mut self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn collect_rules(&mut self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
match def {
&ast::Def::Rule(ref rule) => {
let pos = rule.pos;
@@ -1781,8 +1764,8 @@
}
}
fn check_for_undefined_decls(&self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn check_for_undefined_decls(&self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
if let ast::Def::Decl(decl) = def {
let term = self.get_term_by_name(tyenv, &decl.term).unwrap();
let term = &self.terms[term.index()];
@@ -1799,8 +1782,8 @@
}
}
fn check_for_expr_terms_without_constructors(&self, tyenv: &mut TypeEnv, defs: &ast::Defs) {
for def in &defs.defs {
fn check_for_expr_terms_without_constructors(&self, tyenv: &mut TypeEnv, defs: &[ast::Def]) {
for def in defs {
if let ast::Def::Rule(rule) = def {
rule.expr.terms(&mut |pos, ident| {
let term = match self.get_term_by_name(tyenv, ident) {
@@ -2410,7 +2393,7 @@ mod test {
(type u32 (primitive u32))
(type A extern (enum (B (f1 u32) (f2 u32)) (C (f1 u32))))
";
let ast = parse(Lexer::from_str(text, "file.isle").unwrap()).expect("should parse");
let ast = parse(Lexer::new(0, text).unwrap()).expect("should parse");
let tyenv = TypeEnv::from_ast(&ast).expect("should not have type-definition errors");
let sym_a = tyenv
@@ -2448,8 +2431,6 @@
Pos {
file: 0,
offset: 19,
line: 2,
col: 18,
},
),
Type::Enum {
@@ -2489,8 +2470,6 @@
pos: Pos {
file: 0,
offset: 58,
line: 3,
col: 18,
},
},
];
