commit a66c2e05f9a4e4b69f472da064742bc6e793a9e5 Author: Stefan Bühler Date: Wed Jul 19 13:36:11 2023 +0200 wip diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..514e0ec --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,344 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "brownstone" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5839ee4f953e811bfdcf223f509cb2c6a3e1447959b0bff459405575bc17f22" +dependencies = [ + "arrayvec", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "dashmap" +version = "5.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" +dependencies = [ + "cfg-if", + "hashbrown 0.14.0", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "gc" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3edaac0f5832202ebc99520cb77c932248010c4645d20be1dc62d6579f5b3752" + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" + +[[package]] +name = "indent_write" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cfe9645a18782869361d9c8732246be7b410ad4e919d3609ebabdac00ba12c3" + +[[package]] +name = "internment" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"161079c3ad892faa215fcfcf3fd7a6a3c9288df2b06a2c2bad7fbfad4f01d69d" +dependencies = [ + "ahash", + "dashmap", + "hashbrown 0.12.3", + "once_cell", + "parking_lot", +] + +[[package]] +name = "joinery" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72167d68f5fce3b8655487b8038691a3c9984ee769590f93f2a631f4ad64e4f5" + +[[package]] +name = "libc" +version = "0.2.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" + +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nix" +version = "0.1.0" +dependencies = [ + "derivative", + "gc", + "internment", + "nom", + "nom-supreme", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom-supreme" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd3ae6c901f1959588759ff51c95d24b491ecb9ff91aa9c2ef4acc5b1dcab27" +dependencies = [ + "brownstone", + "indent_write", + "joinery", + "memchr", + "nom", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "proc-macro2" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fe8a65d69dd0808184ebb5f836ab526bb259db23c657efa38711b1072ee47f0" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "smallvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "windows-targets" +version = "0.48.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f463122 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "nix" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +gc = "0.4.1" +internment = { version="0.7.1", default-features=false, features=["arc"] } +nom = "7.1.3" +nom-supreme = "0.8.0" +derivative = "2.2.0" diff --git a/src/bin/nix.rs b/src/bin/nix.rs new file mode 100644 
index 0000000..c7f7cdc --- /dev/null +++ b/src/bin/nix.rs @@ -0,0 +1,39 @@ +extern crate nix; + +fn main() { + let source = std::sync::Arc::new( + nix::parser::source::Source { + filename: "default.nix".into(), + content: r#"let requiredVersion = import ./lib/minver.nix; in + +if ! builtins ? nixVersion || builtins.compareVersions requiredVersion builtins.nixVersion == 1 then + + abort '' + + This version of Nixpkgs requires Nix >= ${requiredVersion}, please upgrade: + + - If you are running NixOS, `nixos-rebuild' can be used to upgrade your system. + + - Alternatively, with Nix > 2.0 `nix upgrade-nix' can be used to imperatively + upgrade Nix. You may use `nix-env --version' to check which version you have. + + - If you installed Nix using the install script (https://nixos.org/nix/install), + it is safe to upgrade by running it again: + + curl -L https://nixos.org/nix/install | sh + + For more information, please see the NixOS release notes at + https://nixos.org/nixos/manual or locally at + ${toString ./nixos/doc/manual/release-notes}. + + If you need further help, see https://nixos.org/nixos/support.html + '' + +else + + import ./pkgs/top-level/impure.nix +"#.into(), + }, + ); + println!("{:?}", nix::parser::token::TokenList::parse_file(source.span()).unwrap().1); +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..67c567f --- /dev/null +++ b/src/lib.rs @@ -0,0 +1 @@ +pub mod parser; diff --git a/src/parser/common.rs b/src/parser/common.rs new file mode 100644 index 0000000..710175b --- /dev/null +++ b/src/parser/common.rs @@ -0,0 +1,41 @@ +use super::source::Span; + +#[derive(Clone, Debug)] +pub enum Number { + Integer(i64), + Float(f64), +} + +#[derive(Clone)] +pub struct SpannedData { + pub data: T, + pub span: Span, +} + +impl std::fmt::Debug for SpannedData +where + T: std::fmt::Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.data.fmt(f) + } +} + +type InnerIdentifier = internment::ArcIntern; + +#[derive(Clone)] +pub struct Identifier(InnerIdentifier); + +impl Identifier { + pub fn from_ref(identifier: &str) -> Self { + Self(InnerIdentifier::from_ref(identifier)) + } +} + +impl std::fmt::Debug for Identifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("`")?; + f.write_str(self.0.as_str())?; + f.write_str("`") + } +} diff --git a/src/parser/expression.rs b/src/parser/expression.rs new file mode 100644 index 0000000..896640d --- /dev/null +++ b/src/parser/expression.rs @@ -0,0 +1,110 @@ +use super::Identifier; + +use std::sync::Arc; + +#[derive(Clone, Debug)] +pub enum Number { + Integer(i64), + Float(f64), +} + +#[derive(Clone, Debug)] +pub enum Literal { + Number(Number), + String(String), +} + +#[derive(Clone, Debug)] +pub enum AttributeName { + // plain identifier + Literal(Identifier), + // quoted strings or ${...} expressions + Interpolated(Arc), +} + +#[derive(Clone, Debug)] +pub enum PathElement { + Fixed(String), + // ${...} + Expression(Arc), +} + +#[derive(Clone, Debug)] +// `inherit NAME1 NAME2;` +// `inherit (SET) NAME1 NAME2;` +pub struct Inherit { + pub from: Option>, + // quoted identifiers are ok, but not dynamic ones + pub names: Vec, +} + +#[derive(Clone, Debug)] +pub enum LetAssignment { + Assign { + // quoted identifier is ok, but not dynamic one + name: Identifier, + value: Arc, + }, + Inherit(Inherit), +} + +#[derive(Clone, Debug)] +pub struct LambdaSetParam { + pub names: Vec<(Identifier, Option>)>, + pub open: bool, // `...` +} + +#[derive(Clone, Debug)] 
+pub enum Expression { + Identifier(Identifier), + Literal(Arc), + InterpolateString(Vec>), + Path { + // base must include `/`, otherwise wouldn't be recognized as path + base: String, + interpolate: Vec, + }, + // `let (NAME = VALUE;)* in EVALUATE` + Let { + assignments: Vec<(Identifier, Arc)>, + evaluate: Arc, + }, + // `with SET; EVALUATE` + // attributes from with don't "shadow" attributes in scope from let/lambda/rec. + // but they do shadow attributes from "more distant" with statements. + With { + set: Arc, + evaluate: Arc, + }, + // `[ ... ]` + List { + elements: Vec>, + }, + // `{ ... }` + AttributeSet { + elements: Vec<(AttributeName, Arc)>, + inherits: Vec, + }, + // `rec ...` + RecursiveSet(Arc), + // `NAME: BODY + // `NAME@{...}: BODY + // `{...}: BODY + Lambda { + // quoting not allowed + name: Option, + set_params: LambdaSetParam, + body: Arc, + }, + // `if COND then TRUE_BRANCH else FALSE_BRANCH` + Conditional { + cond: Arc, + true_branch: Arc, + false_branch: Arc, + }, + // `assert ASSERTION; BODY` + Assert { + assertion: Arc, + body: Arc, + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs new file mode 100644 index 0000000..9cc0795 --- /dev/null +++ b/src/parser/mod.rs @@ -0,0 +1,8 @@ +mod common; +pub mod expression; +pub mod source; +pub mod token; + +pub use self::{ + common::{Number, SpannedData, Identifier}, +}; diff --git a/src/parser/source.rs b/src/parser/source.rs new file mode 100644 index 0000000..885fd24 --- /dev/null +++ b/src/parser/source.rs @@ -0,0 +1,314 @@ +use std::sync::Arc; + +#[derive(Clone, Debug)] +pub struct Source { + pub filename: String, + pub content: String, +} + +impl Source { + pub fn span<'a>(self: &'a Arc) -> SpanRef<'a> { + // Ensure offset/length can be stored in u32 for `Span` + assert!(self.content.len() <= u32::MAX as usize); + SpanRef { + source: self, + data: self.content.as_str(), + } + } +} + +#[derive(Clone)] +pub struct Span { + source: Arc, + start: u32, + length: u32, +} + +impl Span { + pub fn as_str(&self) -> &str { + &self.source.content[self.start as usize..][..self.length as usize] + } + + pub fn as_ref(&self) -> SpanRef<'_> { + SpanRef { + source: &self.source, + data: self.as_str(), + } + } +} + +impl nom::Slice> for Span { + fn slice(&self, range: std::ops::Range) -> Self { + assert!(range.start <= range.end); + assert!(range.end <= self.length as usize); + Self { + source: self.source.clone(), + start: self.start + range.start as u32, + length: (range.end - range.start) as u32, + } + } +} + +impl nom::Slice> for Span { + fn slice(&self, range: std::ops::RangeTo) -> Self { + assert!(range.end <= self.length as usize); + Self { + source: self.source.clone(), + start: self.start, + length: range.end as u32, + } + } +} + +impl nom::Slice> for Span { + fn slice(&self, range: std::ops::RangeFrom) -> Self { + assert!(range.start <= self.length as usize); + Self { + source: self.source.clone(), + start: self.start + range.start as u32, + length: self.length - range.start as u32, + } + } +} + +impl nom::Slice for Span { + fn slice(&self, _range: std::ops::RangeFull) -> Self { + self.clone() + } +} + +impl std::fmt::Debug for Span { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}:{}: {:?}", + self.source.filename, + self.start, + self.as_str(), + ) + } +} + +impl From> for Span { + fn from(span_ref: SpanRef<'_>) -> Self { + use nom::Offset; + let start = span_ref.source.content.as_str().offset(span_ref.data) as u32; + Self { + source: span_ref.source.clone(), + start, + 
length: span_ref.data.len() as u32, + } + } +} + +#[derive(Clone, Copy)] +pub struct SpanRef<'a> { + source: &'a Arc, + data: &'a str, +} + +impl std::fmt::Debug for SpanRef<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use nom::Offset; + write!( + f, + "{}:{}: {:?}", + self.source.filename, + self.source.content.as_str().offset(self.data), + self.data, + ) + } +} + +impl<'a> SpanRef<'a> { + pub fn as_str(&self) -> &'a str { + self.data + } + + fn locate(&self, data: &str) -> &'a str { + use nom::Offset; + &self.data[self.data.offset(data)..][..data.len()] + } + + fn lift_error(&self, e: nom::error::Error<&str>) -> E + where + E: nom::error::ParseError + { + let input = Self { data: self.locate(e.input), ..*self }; + E::from_error_kind(input, e.code) + } + + fn lift_result(&self, r: Result>>) -> Result> + where + E: nom::error::ParseError + { + match r { + Ok(v) => Ok(v), + Err(e) => Err(e.map(|e| self.lift_error(e))), + } + } + +} + +impl<'a> nom::AsBytes for SpanRef<'a> { + fn as_bytes(&self) -> &'a [u8] { + self.data.as_bytes() + } +} + +impl<'a, 'b> nom::Compare<&'b str> for SpanRef<'a> { + #[inline(always)] + fn compare(&self, t: &'b str) -> nom::CompareResult { + self.data.compare(t) + } + + fn compare_no_case(&self, t: &'b str) -> nom::CompareResult { + self.data.compare_no_case(t) + } +} + +impl nom::ExtendInto for SpanRef<'_> { + type Item = char; + type Extender = String; + + fn new_builder(&self) -> Self::Extender { + String::new() + } + + fn extend_into(&self, acc: &mut Self::Extender) { + acc.push_str(self.data); + } +} + +impl nom::FindSubstring<&str> for SpanRef<'_> { + fn find_substring(&self, substr: &str) -> Option { + self.data.find_substring(substr) + } +} + +impl nom::FindToken for SpanRef<'_> +where + for<'a> &'a str: nom::FindToken, +{ + fn find_token(&self, token: T) -> bool { + self.data.find_token(token) + } +} + +impl<'a> nom::InputIter for SpanRef<'a> { + type Item = char; + type Iter = std::str::CharIndices<'a>; + type IterElem = std::str::Chars<'a>; + + fn iter_indices(&self) -> Self::Iter { + self.data.iter_indices() + } + + fn iter_elements(&self) -> Self::IterElem { + self.data.iter_elements() + } + + fn position
<P>
(&self, predicate: P) -> Option + where + P: Fn(Self::Item) -> bool + { + self.data.position(predicate) + } + + fn slice_index(&self, count: usize) -> Result { + self.as_str().slice_index(count) + } +} + +impl nom::InputLength for SpanRef<'_> { + fn input_len(&self) -> usize { + self.data.len() as usize + } +} + +impl<'a> nom::InputTake for SpanRef<'a> { + fn take(&self, count: usize) -> Self { + let data = self.data.take(count); + Self { data, ..*self } + } + + fn take_split(&self, count: usize) -> (Self, Self) { + let (prefix, suffix) = self.data.take_split(count); + (Self { data: prefix, ..*self }, Self { data: suffix, ..*self }) + } +} + +impl nom::InputTakeAtPosition for SpanRef<'_> { + type Item = char; + + fn split_at_position>( + &self, + predicate: P + ) -> nom::IResult + where + P: Fn(Self::Item) -> bool + { + let (rem, data) = self.lift_result(self.data.split_at_position(predicate))?; + Ok((Self { data: rem, ..*self }, Self { data, ..*self } )) + } + + fn split_at_position1>( + &self, + predicate: P, + e: nom::error::ErrorKind + ) -> nom::IResult + where + P: Fn(Self::Item) -> bool + { + let (rem, data) = self.lift_result(self.data.split_at_position1(predicate, e))?; + Ok((Self { data: rem, ..*self }, Self { data, ..*self } )) + } + + fn split_at_position_complete>( + &self, + predicate: P + ) -> nom::IResult + where + P: Fn(Self::Item) -> bool + { + let (rem, data) = self.lift_result(self.data.split_at_position_complete(predicate))?; + Ok((Self { data: rem, ..*self }, Self { data, ..*self } )) + } + + fn split_at_position1_complete>( + &self, + predicate: P, + e: nom::error::ErrorKind + ) -> nom::IResult + where + P: Fn(Self::Item) -> bool + { + let (rem, data) = self.lift_result(self.data.split_at_position1_complete(predicate, e))?; + Ok((Self { data: rem, ..*self }, Self { data, ..*self } )) + } +} + +impl nom::ParseTo for SpanRef<'_> +where + for<'a> &'a str: nom::ParseTo, +{ + fn parse_to(&self) -> Option { + self.data.parse_to() + } +} + +impl nom::Offset for SpanRef<'_> { + fn offset(&self, second: &Self) -> usize { + self.as_str().offset(second.as_str()) + } +} + +impl nom::Slice for SpanRef<'_> +where + for<'a> &'a str: nom::Slice +{ + fn slice(&self, range: R) -> Self { + let data = self.data.slice(range); + Self { data, ..*self } + } +} diff --git a/src/parser/token/brackets.rs b/src/parser/token/brackets.rs new file mode 100644 index 0000000..12311af --- /dev/null +++ b/src/parser/token/brackets.rs @@ -0,0 +1,169 @@ +use super::{ + Token, + SpannedData, + SpanRef, + PResult, + TokenList, + IResultExt, + SpanExt, +}; + +fn parse_bracketed<'a, O, C, F, T>( + open_tag: &'static str, + close_tag: &'static str, + open: O, + close: C, + constructor: F, + span: SpanRef<'a>, +) -> PResult<'a, SpannedData> +where + F: FnOnce(SpannedData, TokenList, SpannedData) -> T, +{ + use nom::{Offset, Slice}; + + let (rem_span, open_span) = nom::bytes::complete::tag(open_tag)(span)?; + let (rem_span, inner) = TokenList::parse_expression(rem_span).unrecoverable()?; + let (rem_span, close_span) = nom::bytes::complete::tag(close_tag)(rem_span).unrecoverable()?; + let result = constructor(open_span.data(open), inner, close_span.data(close)); + let index = span.offset(&rem_span); + let bracket_span = span.slice(..index); + Ok((rem_span, bracket_span.data(result))) +} + +#[derive(Clone, Debug)] +pub struct CurlyOpen; + +#[derive(Clone, Debug)] +pub struct CurlyClose; + +#[derive(Clone)] +pub struct CurlyBrackets { + pub open: SpannedData, + pub inner: TokenList, + pub close: SpannedData, +} + 
+impl CurlyBrackets { + pub(super) fn parse(span: SpanRef) -> PResult> { + parse_bracketed( + "{", "}", CurlyOpen, CurlyClose, + |open, inner, close| { + Self { open, inner, close }.into() + }, + span, + ) + } +} + +impl std::fmt::Debug for CurlyBrackets { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.inner.0.is_empty() { + f.write_str("{ }") + } else { + write!(f, "{{ {:?} }}", self.inner) + } + } +} + +#[derive(Clone, Debug)] +pub struct SquareOpen; + +#[derive(Clone, Debug)] +pub struct SquareClose; + +#[derive(Clone)] +pub struct SquareBrackets { + pub open: SpannedData, + pub inner: TokenList, + pub close: SpannedData, +} + +impl SquareBrackets { + pub(super) fn parse(span: SpanRef) -> PResult> { + parse_bracketed( + "[", "]", SquareOpen, SquareClose, + |open, inner, close| { + Self { open, inner, close }.into() + }, + span, + ) + } +} + +impl std::fmt::Debug for SquareBrackets { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.inner.0.is_empty() { + f.write_str("[ ]") + } else { + write!(f, "[ {:?} ]", self.inner) + } + } +} + +#[derive(Clone, Debug)] +pub struct RoundOpen; + +#[derive(Clone, Debug)] +pub struct RoundClose; + +#[derive(Clone)] +pub struct RoundBrackets { + pub open: SpannedData, + pub inner: TokenList, + pub close: SpannedData, +} + +impl RoundBrackets { + pub(super) fn parse(span: SpanRef) -> PResult> { + parse_bracketed( + "(", ")", RoundOpen, RoundClose, + |open, inner, close| { + Self { open, inner, close }.into() + }, + span, + ) + } +} + +impl std::fmt::Debug for RoundBrackets { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.inner.0.is_empty() { + f.write_str("( )") + } else { + write!(f, "( {:?} )", self.inner) + } + } +} + +#[derive(Clone, Debug)] +pub struct InterpolateOpen; + +/// Any `${...}` expresions, whether in strings or outside +#[derive(Clone)] +pub struct Interpolate { + pub open: SpannedData, + pub inner: TokenList, + pub close: SpannedData, +} + +impl Interpolate { + pub(super) fn parse(span: SpanRef) -> PResult> { + parse_bracketed( + "${", "}", InterpolateOpen, CurlyClose, + |open, inner, close| { + Self { open, inner, close } + }, + span, + ) + } +} + +impl std::fmt::Debug for Interpolate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.inner.0.is_empty() { + f.write_str("${ }") + } else { + write!(f, "${{ {:?} }}", self.inner) + } + } +} diff --git a/src/parser/token/mod.rs b/src/parser/token/mod.rs new file mode 100644 index 0000000..61509e6 --- /dev/null +++ b/src/parser/token/mod.rs @@ -0,0 +1,157 @@ +mod brackets; +mod number; +mod op_kw_ident; +mod path; +mod strings; +mod tokenlist; + +pub use self::{ + brackets::{ + CurlyOpen, + CurlyClose, + CurlyBrackets, + SquareOpen, + SquareClose, + SquareBrackets, + RoundOpen, + RoundClose, + RoundBrackets, + InterpolateOpen, + Interpolate, + }, + op_kw_ident::SimpleToken, + path::Path, + tokenlist::TokenList, + strings::{ + Literal, + StringPart, + }, +}; + +use super::{ + source::{Span, SpanRef}, + Number, + SpannedData, + Identifier, +}; + +trait SpanExt { + fn data(self, data: T) -> SpannedData; +} + +impl SpanExt for SpanRef<'_> { + fn data(self, data: T) -> SpannedData { + SpannedData { data: data, span: self.into() } + } +} + +struct Spanned { + f: F, + o: std::marker::PhantomData, +} + +impl<'a, F, O, E> nom::Parser, SpannedData, E> for Spanned +where + F: nom::Parser, O, E>, +{ + fn parse(&mut self, input: SpanRef<'a>) -> nom::IResult, SpannedData, E> { + 
use nom::{Offset, Slice}; + + match self.f.parse(input) { + Ok((remaining, result)) => { + let index = input.offset(&remaining); + let consumed = input.slice(..index); + Ok((remaining, consumed.data(result))) + } + Err(e) => Err(e), + } + } +} + +trait ParserExt<'a, O, E> { + fn spanned(self) -> Spanned + where + Self: Sized, + { + Spanned { f: self, o: std::marker::PhantomData } + } +} + +impl<'a, O, E, T> ParserExt<'_, O, E> for T +where + T: nom::Parser, O, E> +{ +} + +trait IResultExt { + fn unrecoverable(self) -> Self; +} + +impl IResultExt for Result> { + fn unrecoverable(self) -> Self { + match self { + Err(nom::Err::Error(e)) => Err(nom::Err::Failure(e)), + v => v, + } + } +} + +#[derive(Clone, derivative::Derivative)] +#[derivative(Debug)] +pub enum Token { + #[derivative(Debug="transparent")] + SimpleToken(SimpleToken), + #[derivative(Debug="transparent")] + Number(Number), + #[derivative(Debug="transparent")] + Identifier(Identifier), + #[derivative(Debug="transparent")] + Path(Path), + /// `"..."` (might have been ''..'' or URI in source) + String(Vec), + /// `${...}` + #[derivative(Debug="transparent")] + Interpolate(Interpolate), + /// `{ ... }` + #[derivative(Debug="transparent")] + CurlyBrackets(CurlyBrackets), + /// `[ ... ]` + #[derivative(Debug="transparent")] + SquareBrackets(SquareBrackets), + /// `( ... )` + #[derivative(Debug="transparent")] + RoundBrackets(RoundBrackets), +} + +macro_rules! to_token { + ($($id:ident,)*) => { $( + impl From<$id> for Token { + fn from(t: $id) -> Self { + Self::$id(t) + } + } + + impl From> for SpannedData { + fn from(t: SpannedData<$id>) -> Self { + SpannedData { + data: Token::$id(t.data), + span: t.span, + } + } + } + )* }; +} + +to_token!{ + SimpleToken, + Number, + Identifier, + Path, + Interpolate, + CurlyBrackets, + SquareBrackets, + RoundBrackets, +} + +// pub type PResult = nom::IResult>; +pub type PResult<'a, T> = nom::IResult, T>; diff --git a/src/parser/token/number.rs b/src/parser/token/number.rs new file mode 100644 index 0000000..e3d782c --- /dev/null +++ b/src/parser/token/number.rs @@ -0,0 +1,48 @@ +use nom::Parser; +use super::{ + SpanRef, + PResult, + Token, + Number, + IResultExt, + SpannedData, + SpanExt, +}; + +// should come after path +fn parse_number_span(span: SpanRef) -> PResult<()> { + // if not a path, everything that starts with an optional '-', optional '.' + // followed by digits is a number. + + let (span, _) = nom::sequence::tuple(( + nom::combinator::opt(nom::bytes::complete::tag("-")), + nom::combinator::opt(nom::bytes::complete::tag(".")), + nom::character::complete::digit1, + ))(span)?; + + // if we fail now, fail hard (upstream nix parses something crazy here). + // take up all alpha characters too, should be separated by something. 
+ let (span, _) = nom::multi::many0_count(nom::branch::alt(( + nom::character::complete::alphanumeric1.map(|_| ()), + nom::bytes::complete::tag(".").map(|_| ()), + nom::bytes::complete::tag("e").map(|_| ()), + )))(span).unrecoverable()?; + + Ok((span, ())) +} + +impl Number { + // should come after path + pub(super) fn parse(span: SpanRef) -> PResult> { + let (rem_span, num_span) = nom::combinator::recognize(parse_number_span)(span)?; + let num_s = num_span.as_str(); + let num = if let Ok(num) = num_s.parse() { + Number::Integer(num) + } else if let Ok(num) = num_s.parse() { + Number::Float(num) + } else { + return nom::combinator::fail(span).unrecoverable(); + }; + Ok((rem_span, num_span.data(Token::Number(num)))) + } +} diff --git a/src/parser/token/op_kw_ident.rs b/src/parser/token/op_kw_ident.rs new file mode 100644 index 0000000..1c0233c --- /dev/null +++ b/src/parser/token/op_kw_ident.rs @@ -0,0 +1,161 @@ +use nom::Parser; +use super::{ + SpanRef, + PResult, + SpannedData, + Token, + SpanExt, + Identifier, +}; + +#[derive(Clone, Copy, Debug)] +/// Keywords / operators we tokenize as "standalone" +pub enum SimpleToken { + /// `=` + Assign, + /// `:` + Colon, + /// `;` + SemiColon, + /// `@` - lambda parameter alias + At, + /// `.` - attribute selection + Dot, + /// `?` - has attribute + QuestionMark, + /// `//` - attribute set update + DoubleSlash, + /// `++` - list concatenation + DoublePlus, + + /// `*` + Multiplication, + /// `/` + Division, + /// `+` + Plus, + /// `-` + Minus, + + /// `<` + LessThan, + /// `<=` + LessThanOrEqual, + /// `>` + GreaterThan, + /// `>=` + GreaterThanOrEqual, + /// `==` + Equal, + /// `!=` + NotEqual, + + /// `!` + LogicNot, + /// `&&` + LogicAnd, + /// `||` + LogicOr, + /// `->` (`a -> b` == `!a || b`) + LogicImplication, + + /// `or` - attribute selection fallback + KwOr, + /// `let` + KwLet, + /// `with` + KwWith, + /// `rec` + KwRec, + /// `inherit` + KwInherit, + /// `if` + KwIf, + /// `then` + KwThen, + /// `else` + KwElse, + /// `assert` + KwAssert, +} + +// this also finds (some) path prefixes - path alternative should come before +fn ident_or_keyword(span: SpanRef) -> PResult> { + let (rem_span, ident_span) = nom::combinator::recognize( + nom::sequence::pair( + nom::branch::alt(( + nom::character::complete::alpha1.map(|_| ()), + nom::bytes::complete::tag("_").map(|_| ()), + )), + nom::multi::many0_count(nom::branch::alt(( + nom::character::complete::alphanumeric1.map(|_| ()), + nom::bytes::complete::tag("_").map(|_| ()), + nom::bytes::complete::tag("-").map(|_| ()), + ))) + ) + )(span)?; + let t = match ident_span.as_str() { + "or" => SimpleToken::KwOr, + "let" => SimpleToken::KwLet, + "with" => SimpleToken::KwWith, + "rec" => SimpleToken::KwRec, + "inherit" => SimpleToken::KwInherit, + "if" => SimpleToken::KwIf, + "then" => SimpleToken::KwThen, + "else" => SimpleToken::KwElse, + "assert" => SimpleToken::KwAssert, + ident => return Ok(( + rem_span, + ident_span.data(Token::Identifier(Identifier::from_ref(ident))), + )), + }; + Ok((rem_span, ident_span.data(Token::SimpleToken(t)))) +} + +fn simple_tagged(tag: &'static str, t: SimpleToken) -> impl Fn(SpanRef) -> PResult> { + move |span| { + let (rem_span, token_span) = nom::bytes::complete::tag(tag)(span)?; + Ok((rem_span, token_span.data(Token::SimpleToken(t)))) + } +} + +fn simple_op(span: SpanRef) -> PResult> { + nom::branch::alt(( + nom::branch::alt(( + simple_tagged(":", SimpleToken::Colon), + simple_tagged(";", SimpleToken::SemiColon), + simple_tagged("@", SimpleToken::At), + 
simple_tagged(".", SimpleToken::Dot), + simple_tagged("?", SimpleToken::QuestionMark), + simple_tagged("//", SimpleToken::DoubleSlash), + simple_tagged("++", SimpleToken::DoublePlus), + + simple_tagged("*", SimpleToken::Multiplication), + simple_tagged("/", SimpleToken::Division), + simple_tagged("+", SimpleToken::Plus), + simple_tagged("-", SimpleToken::Minus), + )), + nom::branch::alt(( + simple_tagged("<=", SimpleToken::LessThanOrEqual), + simple_tagged("<", SimpleToken::LessThan), + simple_tagged(">=", SimpleToken::GreaterThanOrEqual), + simple_tagged(">", SimpleToken::GreaterThan), + simple_tagged("==", SimpleToken::Equal), + simple_tagged("!=", SimpleToken::NotEqual), + + simple_tagged("=", SimpleToken::Assign), + + simple_tagged("!", SimpleToken::LogicNot), + simple_tagged("&&", SimpleToken::LogicAnd), + simple_tagged("||", SimpleToken::LogicOr), + simple_tagged("->", SimpleToken::LogicImplication), + )), + ))(span) +} + +pub(super) fn op_ident_or_keyword(span: SpanRef) -> PResult> { + nom::branch::alt(( + simple_op, + ident_or_keyword, + ))(span) +} diff --git a/src/parser/token/path.rs b/src/parser/token/path.rs new file mode 100644 index 0000000..66fa38e --- /dev/null +++ b/src/parser/token/path.rs @@ -0,0 +1,175 @@ +use nom::Parser; + +use super::{ + SpannedData, + StringPart, + SpanRef, + PResult, + Literal, + SpanExt, + Interpolate, + IResultExt, + Token, +}; + +#[derive(Clone, Debug)] +pub struct Path { + pub base: SpannedData, + pub additional: Vec, +} + +fn path_separator(span: SpanRef) -> PResult { + nom::sequence::preceded( + nom::sequence::pair( + nom::combinator::not(nom::bytes::complete::tag("//")), + nom::combinator::not(nom::bytes::complete::tag("/*")), + ), + nom::bytes::complete::tag("/"), + )(span) +} + +struct PathBuilder<'a> { + span: SpanRef<'a>, + cur_lit_start: Option, + base: Option>, + additional: Vec, +} + +impl<'a> PathBuilder<'a> { + fn new(span: SpanRef<'a>) -> Self { + Self { + span, + cur_lit_start: Some(0), + base: None, + additional: Vec::new(), + } + } + + fn add_lit(&mut self, lit_span: SpanRef) { + use nom::Offset; + + if self.cur_lit_start.is_none() { + self.cur_lit_start = Some(self.span.offset(&lit_span)); + } + } + + fn _end_lit(&mut self, next_span: SpanRef) { + use nom::{Offset, Slice}; + + if let Some(start) = self.cur_lit_start.take() { + let end = self.span.offset(&next_span); + let lit_span = self.span.slice(start..end); + let lit = lit_span.data(lit_span.as_str().into()); + if self.additional.is_empty() { + assert!(self.base.is_none()); + self.base = Some(lit); + } else { + self.additional.push(StringPart::Literal(Literal::from(lit_span))); + } + } + } + + fn add_interp(&mut self, span: SpanRef, interp: Interpolate) { + self._end_lit(span); + + assert!(self.base.is_some()); + self.additional.push(StringPart::Interpolate(span.data(interp))); + } + + fn build(mut self, rem_span: SpanRef<'_>) -> SpannedData { + use nom::{Offset, Slice}; + + self._end_lit(rem_span); + let path = Path { + base: self.base.take().expect("base can't be empty here"), + additional: self.additional, + }; + + let end = self.span.offset(&rem_span); + let path_span = self.span.slice(..end); + + path_span.data(path) + } +} + +impl Path { + pub(super) fn parse(span: SpanRef) -> PResult> { + // first segment before a '/' - possibly empty + let mut first_segment = nom::combinator::opt( + nom::branch::alt(( + // `~` only allowed as first (full) segment + nom::bytes::complete::tag("~").map(|_| ()), + nom::sequence::pair( + nom::branch::alt(( + 
nom::character::complete::alphanumeric1.map(|_| ()), + nom::bytes::complete::tag("-").map(|_| ()), + nom::bytes::complete::tag("_").map(|_| ()), + nom::bytes::complete::tag(".").map(|_| ()), + )), + nom::multi::many0_count(nom::branch::alt(( + nom::character::complete::alphanumeric1.map(|_| ()), + nom::bytes::complete::tag("-").map(|_| ()), + nom::bytes::complete::tag("_").map(|_| ()), + nom::bytes::complete::tag(".").map(|_| ()), + ))), + ).map(|_| ()), + )) + ); + + // segments after the first / contain combinations of literal parts and ${...} expressions + let mut later_segment_literal = nom::combinator::recognize( + nom::multi::many1_count(nom::branch::alt(( + nom::character::complete::alphanumeric1.map(|_| ()), + nom::bytes::complete::tag("-").map(|_| ()), + nom::bytes::complete::tag("_").map(|_| ()), + nom::bytes::complete::tag(".").map(|_| ()), + ))), + ); + + let (mut rem_span, _) = first_segment(span)?; + path_separator(rem_span)?; // shortcut if it can't be a path + + let mut found_separators = 0; + let mut path = PathBuilder::new(span); + + while let Ok((next_span, sep_span)) = path_separator(rem_span) { + found_separators += 1; + path.add_lit(sep_span); + rem_span = next_span; + let mut parts = 0; + loop { + if let Ok((next_span, (interp_span, interp))) = nom::combinator::consumed(Interpolate::parse)(rem_span) { + path.add_interp(interp_span, interp.data); + rem_span = next_span; + parts += 1; + continue; + } + match later_segment_literal(rem_span) as PResult> { + Ok((next_span, lit_span)) => { + path.add_lit(lit_span); + rem_span = next_span; + parts += 1; + }, + Err(_e) => { + if parts == 0 { + // trailing slash + if found_separators == 1 { + // only one slash, and it is trailing -> not a path. + return nom::combinator::fail(rem_span); + } else { + // invalid path - trailing slash not allowed + // TODO: proper error message + return nom::combinator::fail(rem_span).unrecoverable(); + } + } + break + } + } + } + } + + assert!(found_separators >= 1); // we check for initial separator above + + Ok((rem_span, path.build(rem_span).into())) + } +} diff --git a/src/parser/token/strings.rs b/src/parser/token/strings.rs new file mode 100644 index 0000000..b739fdc --- /dev/null +++ b/src/parser/token/strings.rs @@ -0,0 +1,397 @@ +use nom::Parser; + +use super::{ + Span, + SpanRef, + SpanExt, + PResult, + Token, + SpannedData, + Interpolate, IResultExt, +}; + +#[derive(Clone)] +pub struct Literal { + span: Span, +} + +impl Literal { + pub fn as_str(&self) -> &str { + self.span.as_str() + } +} + +impl std::ops::Deref for Literal { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.span.as_str() + } +} + +impl From> for Literal { + fn from(span_ref: SpanRef) -> Self { + Self { span: span_ref.into() } + } +} + +impl std::fmt::Debug for Literal { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +#[derive(Clone, derivative::Derivative)] +#[derivative(Debug)] +pub enum StringPart { + #[derivative(Debug="transparent")] + Literal(Literal), + #[derivative(Debug="transparent")] + Escaped(char), + #[derivative(Debug="transparent")] + Interpolate(SpannedData), +} + +pub(super) struct StringBuilder<'a> { + span: SpanRef<'a>, + parts: Vec, + cur_lit: Option>, +} + +impl<'a> StringBuilder<'a> { + fn new(span: SpanRef<'a>) -> Self { + Self { + span, + parts: Vec::new(), + cur_lit: None, + } + } + + fn add_lit(&mut self, span: SpanRef<'a>) { + use nom::{Offset, Slice}; + + let start = self.span.offset(&span); + let mut next = 
start..start + span.as_str().len(); + if let Some(cur) = self.cur_lit.take() { + if cur.end == next.start { + next.start = cur.start; + } else { + self.parts.push(StringPart::Literal(Literal::from(self.span.slice(cur)))); + } + } + self.cur_lit = Some(next); + } + + fn _end_lit(&mut self) { + use nom::Slice; + + if let Some(cur) = self.cur_lit.take() { + self.parts.push(StringPart::Literal(Literal::from(self.span.slice(cur)))); + } + } + + fn add_escaped(&mut self, ch: char) { + self._end_lit(); + self.parts.push(StringPart::Escaped(ch)) + } + + fn add_interp(&mut self, interp: SpannedData) { + self._end_lit(); + self.parts.push(StringPart::Interpolate(interp)); + } + + fn finish(mut self, rem_span: SpanRef) -> (SpanRef<'a>, Vec) { + use nom::{Offset, Slice}; + self._end_lit(); + let length = self.span.offset(&rem_span); + let span = self.span.slice(..length); + (span, self.parts) + } +} + +fn parse_dq(span: SpanRef) -> PResult> { + let (mut rem_span, _open_span) = nom::bytes::complete::tag("\"")(span)?; + let mut sb = StringBuilder::new(span); + + loop { + if let Ok((rem_span, _close_span)) = nom::bytes::complete::tag("\"")(rem_span) as PResult<_> { + let (span, parts) = sb.finish(rem_span); + return Ok((rem_span, span.data(Token::String(parts)))); + } + match Interpolate::parse(rem_span) { + Ok((r, interp)) => { + rem_span = r; + sb.add_interp(interp); + continue; + }, + Err(nom::Err::Failure(f)) => return Err(nom::Err::Failure(f)), + Err(_) => (), // wasn't a ${ ... }, fall through + } + if let Ok((r, _)) = nom::bytes::complete::tag("\\")(rem_span) as PResult<_> { + let (r, (escaped_span, escaped)) = nom::combinator::consumed( + nom::character::complete::anychar + )(r).unrecoverable()?; + rem_span = r; + match escaped { + 'n' => sb.add_escaped('\n'), + 'r' => sb.add_escaped('\r'), + 't' => sb.add_escaped('\t'), + '"'|'\\'|'$' => { + // must be escaped + sb.add_lit(escaped_span); + }, + _ => { + // useless escape + sb.add_lit(escaped_span); + } + } + continue; + } + let (r, lit_span) = nom::bytes::complete::is_not("\"\\$")(rem_span).unrecoverable()?; + rem_span = r; + sb.add_lit(lit_span); + } +} + +fn lit_remove_shared_ident(at_start: bool, prefix_len: usize, lit: &Literal) -> impl Iterator + '_ { + use nom::Slice; + + let mut offset = 0; + let lit_str = lit.as_str(); + std::iter::from_fn(move || { + // if offset > 0 we set it there after we found a `\n` - i.e. always start of a line + // if offset = 0 it depends on at_start: + let at_line_start = offset != 0 || at_start; + let remaining = &lit_str[offset..]; + if remaining.is_empty() { return None; } + let result: Literal; + if let Some(line_len) = remaining.find('\n') { + let abs_end = offset+line_len+1; + if at_line_start { + let line_offset = prefix_len.min(line_len); // might be an empty line without full prefix + result = Literal { span: lit.span.slice(offset+line_offset..abs_end) }; + } else { + // not at line start, nothing to remove + result = Literal { span: lit.span.slice(offset..abs_end) }; + } + offset = abs_end; + } else if at_line_start { + // not an "empty line" (apart from spaces), i.e. prefix must be here completely + assert!(remaining.len() >= prefix_len); + result = Literal { span: lit.span.slice(offset+prefix_len..) }; + offset = lit_str.len(); // end iterator + } else { + // not at line start, nothing to remove + result = Literal { span: lit.span.slice(offset..) 
}; + offset = lit_str.len(); // end iterator + } + Some(StringPart::Literal(result)) + }) +} + +fn remove_shared_ident(parts: &mut Vec) { + use nom::Slice; + + // remove trailing spaces after the last newline + if let Some(StringPart::Literal(last_lit)) = parts.last_mut() { + if let Some(last_non_space) = last_lit.rfind(|c| c != ' ') { + if last_lit.as_bytes()[last_non_space] == b'\n' { + *last_lit = Literal { span: last_lit.span.slice(..last_non_space+1) }; + } + } + } + + let mut at_start = true; + let mut at_line_start = true; + let mut current_max_prefix = None; + for part in parts.iter() { + if at_line_start && !at_start { + // the parser would not have splitted a literal ending in `\n` and + // the next one starting with ` ` + // i.e. there shouldn't be a literal coming right now. + // -> empty prefix, nothing to remove + return; + } + at_start = false; + if let StringPart::Literal(lit) = part { + let lit_str = lit.as_str(); + let mut lines = lit_str.split('\n'); + if !at_line_start { + // if we weren't at the start of a line skip the + // first part before a '\n'. + // if there is no '\n' no other parts will follow, + // and at_line_start stays false. + let _ = lines.next(); + } + for line in lines { + // we are now at the start of a line + // (either we were at a start before, or the first part was skipped) + + // if there is nothing else than ' ' - ignore line for prefix calculation. + if let Some(prefix_len) = line.find(|c| c != ' ') { + if prefix_len == 0 { + // empty prefix, nothing to remove + return; + } + if let Some(cur_prefix_len) = current_max_prefix { + current_max_prefix = Some(prefix_len.min(cur_prefix_len)); + } + } + + // the next iteration will always be at the start of a line, + // but if this is the last iteration, at_line_start is true afterwards + // only if this part is empty: + at_line_start = line.is_empty(); + } + } else if at_line_start { + // empty prefix, nothing to remove + return; + } + } + + let prefix_len = match current_max_prefix { + None => return, // no literal parts -> no prefixes + Some(v) => v, + }; + assert!(prefix_len > 0); + + let mut index = 0; + let mut at_start = true; + while index < parts.len() { + if let StringPart::Literal(lit) = parts[index].clone() { + let mut clipped_parts = lit_remove_shared_ident(at_start, prefix_len, &lit); + if let Some(part) = clipped_parts.next() { + parts[index] = part; + index += 1; + for part in clipped_parts { + parts.insert(index, part); + index += 1; + } + } else { + parts.remove(index); + } + } else { + index += 1; + } + at_start = false; + } +} + +fn parse_two_sq(span: SpanRef) -> PResult> { + use nom::Slice; + + let (mut rem_span, _open_span) = nom::bytes::complete::tag("''")(span)?; + let mut sb = StringBuilder::new(span); + + // skip first line if it only contains " " (or is empty) + let (r, _) = nom::combinator::opt( + nom::sequence::pair( + nom::combinator::opt(nom::bytes::complete::is_a(" ")), + nom::bytes::complete::tag("\n"), + ), + )(rem_span)?; + rem_span = r; + + loop { + if let Ok((r, escaped_two_sq)) = nom::bytes::complete::tag("'''")(rem_span) as PResult { + // '' is escaped by a single ' + rem_span = r; + sb.add_lit(escaped_two_sq.slice(1..)); + continue; + } + if let Ok((r, escaped_dollar)) = nom::bytes::complete::tag("''$")(rem_span) as PResult { + // $ is escaped by '' + rem_span = r; + sb.add_lit(escaped_dollar.slice(2..)); + continue; + } + if let Ok((r, _escape)) = nom::bytes::complete::tag("''\\")(rem_span) as PResult { + // ''\ is the generic escape for the following character + 
let (r, (escaped_span, escaped)) = nom::combinator::consumed( + nom::character::complete::anychar + )(r).unrecoverable()?; + rem_span = r; + match escaped { + 'n' => sb.add_escaped('\n'), + 'r' => sb.add_escaped('\r'), + 't' => sb.add_escaped('\t'), + ' ' => sb.add_escaped(' '), // not part of the indent, add as escaped part + _ => { + // useless escape - \ doesn't need an escape, $ should be ''$, ... + sb.add_lit(escaped_span); + } + } + continue; + } + if let Ok((r, two_dollar)) = nom::bytes::complete::tag("$$")(rem_span) as PResult<_> { + // $$ is passed through as literal string, $${..} not parsed as interpolation + rem_span = r; + sb.add_lit(two_dollar); + continue; + } + + if let Ok((rem_span, _close_span)) = nom::bytes::complete::tag("''")(rem_span) as PResult<_> { + let (span, mut parts) = sb.finish(rem_span); + remove_shared_ident(&mut parts); + return Ok((rem_span, span.data(Token::String(parts)))); + } + if let Ok((r, lit_sq)) = nom::bytes::complete::tag("'")(rem_span) as PResult { + // ' - not followed by another ' + rem_span = r; + sb.add_lit(lit_sq); + continue; + } + match Interpolate::parse(rem_span) { + Ok((r, interp)) => { + rem_span = r; + sb.add_interp(interp); + continue; + }, + Err(nom::Err::Failure(f)) => return Err(nom::Err::Failure(f)), + Err(_) => (), // wasn't a ${ ... }, fall through + } + + let (r, lit_span) = nom::bytes::complete::is_not("'$")(rem_span).unrecoverable()?; + rem_span = r; + sb.add_lit(lit_span); + } +} + +fn parse_uri(span: SpanRef) -> PResult> { + // nix doc says: "URIs as defined in appendix B of RFC 2396"; + // but the appendix only gives a regex to **split** valid URIs, + // not one to properly find them in the first place. + // it also would match relative URIs and so on - we should only accept absolute URIs. + + // regex to split from appendix b: ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
+ // nix upstream uses: [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+ + + let (rem_span, uri_span) = nom::combinator::recognize(nom::sequence::tuple(( + // scheme + nom::character::complete::alpha1, + nom::multi::many0_count(nom::branch::alt(( + nom::character::complete::alphanumeric1.map(|_| ()), + nom::character::complete::one_of("+-.").map(|_| ()), + ))), + // ":" + nom::bytes::complete::tag(":"), + // [-a-zA-Z0-9%/?:@&=+$,_.!~*']+ + nom::multi::many0_count(nom::branch::alt(( + nom::character::complete::alphanumeric1.map(|_| ()), + nom::character::complete::one_of("-%/?:@&=+$,_.!~*'").map(|_| ()), + ))), + )))(span)?; + + let uri_lit = Literal::from(uri_span); + let uri = Token::String(vec![StringPart::Literal(uri_lit)]); + + Ok((rem_span, uri_span.data(uri))) +} + +pub(super) fn parse_string(span: SpanRef) -> PResult> { + nom::branch::alt(( + parse_dq, + parse_two_sq, + parse_uri, + ))(span) +} diff --git a/src/parser/token/tokenlist.rs b/src/parser/token/tokenlist.rs new file mode 100644 index 0000000..4bbdb4f --- /dev/null +++ b/src/parser/token/tokenlist.rs @@ -0,0 +1,68 @@ +use nom::Parser; +use super::{ + SpannedData, + Token, + SpanRef, + PResult, +}; + +fn parse_token(span: SpanRef) -> PResult> { + nom::branch::alt(( + super::strings::parse_string, + super::Path::parse, + super::op_kw_ident::op_ident_or_keyword, + super::Number::parse, + super::CurlyBrackets::parse, + super::SquareBrackets::parse, + super::RoundBrackets::parse, + Parser::into(super::Interpolate::parse), + ))(span) +} + +fn skip_ws(span: SpanRef) -> PResult<()> { + nom::multi::many0_count(nom::branch::alt(( + nom::character::complete::multispace1.map(|_| ()), + // `# ...` comments + nom::sequence::pair( + nom::bytes::complete::tag("#"), + nom::bytes::complete::is_not("\n\r"), + ).map(|_| ()), + // /* ... */ comments + nom::sequence::tuple(( + nom::bytes::complete::tag("/*"), + nom::bytes::complete::take_until("*/"), + nom::bytes::complete::tag("*/"), + )).map(|_| ()), + ))).map(|_| ()).parse(span) +} + +#[derive(Clone)] +pub struct TokenList(pub Vec>); + +impl TokenList { + pub fn parse_expression(span: SpanRef) -> PResult { + nom::sequence::preceded( + skip_ws, + nom::multi::many0( + nom::sequence::terminated(parse_token, skip_ws) + ), + ).map(Self).parse(span) + } + + pub fn parse_file(span: SpanRef) -> PResult { + nom::combinator::all_consuming(Self::parse_expression)(span) + } +} + +impl std::fmt::Debug for TokenList { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(head) = self.0.first() { + head.fmt(f)?; + for elem in &self.0[1..] { + f.write_str(" ")?; + elem.fmt(f)?; + } + } + Ok(()) + } +} diff --git a/test.nix b/test.nix new file mode 100644 index 0000000..fa519e6 --- /dev/null +++ b/test.nix @@ -0,0 +1,12 @@ +{ s = '' + abc + + def +''; + t = '' + abc + ''; + r = '' ''; + r1 = '' + abc ''; +}
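
For reference, a minimal driver in the style of src/bin/nix.rs above shows how the tokenizer in this commit is meant to be invoked: build a Source, take a SpanRef with span(), and hand it to TokenList::parse_file. This is only a sketch, not part of the commit; it assumes the crate name `nix` from Cargo.toml and uses a small hypothetical input string, printing the debug representation of the token list just as the bundled binary does.

use std::sync::Arc;

use nix::parser::{source::Source, token::TokenList};

fn main() {
    // Hypothetical example input; any small Nix expression works here.
    let source = Arc::new(Source {
        filename: "example.nix".into(),
        content: "{ x = 1 + 2; s = ''\n  hello\n''; }".into(),
    });
    // span() asserts the content length fits into the u32 offsets used by Span
    // and yields a SpanRef borrowing the whole file.
    let span = source.span();
    // parse_file wraps parse_expression in all_consuming: it fails unless the
    // entire input tokenizes (whitespace and comments are skipped between tokens).
    match TokenList::parse_file(span) {
        Ok((_rest, tokens)) => println!("{:?}", tokens),
        Err(err) => eprintln!("tokenize error: {:?}", err),
    }
}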