398 lines
11 KiB
Rust
398 lines
11 KiB
Rust
use nom::Parser;
|
|
|
|
use super::{
|
|
Span,
|
|
SpanRef,
|
|
SpanExt,
|
|
PResult,
|
|
Token,
|
|
SpannedData,
|
|
Interpolate, IResultExt,
|
|
};
|
|
|
|
#[derive(Clone)]
|
|
pub struct Literal {
|
|
span: Span,
|
|
}
|
|
|
|
impl Literal {
|
|
pub fn as_str(&self) -> &str {
|
|
self.span.as_str()
|
|
}
|
|
}
|
|
|
|
impl std::ops::Deref for Literal {
|
|
type Target = str;
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
self.span.as_str()
|
|
}
|
|
}
|
|
|
|
impl From<SpanRef<'_>> for Literal {
|
|
fn from(span_ref: SpanRef) -> Self {
|
|
Self { span: span_ref.into() }
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Debug for Literal {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
self.as_str().fmt(f)
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, derivative::Derivative)]
|
|
#[derivative(Debug)]
|
|
pub enum StringPart {
|
|
#[derivative(Debug="transparent")]
|
|
Literal(Literal),
|
|
#[derivative(Debug="transparent")]
|
|
Escaped(char),
|
|
#[derivative(Debug="transparent")]
|
|
Interpolate(SpannedData<Interpolate>),
|
|
}
|
|
|
|
pub(super) struct StringBuilder<'a> {
|
|
span: SpanRef<'a>,
|
|
parts: Vec<StringPart>,
|
|
cur_lit: Option<std::ops::Range<usize>>,
|
|
}
|
|
|
|
impl<'a> StringBuilder<'a> {
|
|
fn new(span: SpanRef<'a>) -> Self {
|
|
Self {
|
|
span,
|
|
parts: Vec::new(),
|
|
cur_lit: None,
|
|
}
|
|
}
|
|
|
|
fn add_lit(&mut self, span: SpanRef<'a>) {
|
|
use nom::{Offset, Slice};
|
|
|
|
let start = self.span.offset(&span);
|
|
let mut next = start..start + span.as_str().len();
|
|
if let Some(cur) = self.cur_lit.take() {
|
|
if cur.end == next.start {
|
|
next.start = cur.start;
|
|
} else {
|
|
self.parts.push(StringPart::Literal(Literal::from(self.span.slice(cur))));
|
|
}
|
|
}
|
|
self.cur_lit = Some(next);
|
|
}
|
|
|
|
fn _end_lit(&mut self) {
|
|
use nom::Slice;
|
|
|
|
if let Some(cur) = self.cur_lit.take() {
|
|
self.parts.push(StringPart::Literal(Literal::from(self.span.slice(cur))));
|
|
}
|
|
}
|
|
|
|
fn add_escaped(&mut self, ch: char) {
|
|
self._end_lit();
|
|
self.parts.push(StringPart::Escaped(ch))
|
|
}
|
|
|
|
fn add_interp(&mut self, interp: SpannedData<Interpolate>) {
|
|
self._end_lit();
|
|
self.parts.push(StringPart::Interpolate(interp));
|
|
}
|
|
|
|
fn finish(mut self, rem_span: SpanRef) -> (SpanRef<'a>, Vec<StringPart>) {
|
|
use nom::{Offset, Slice};
|
|
self._end_lit();
|
|
let length = self.span.offset(&rem_span);
|
|
let span = self.span.slice(..length);
|
|
(span, self.parts)
|
|
}
|
|
}
|
|
|
|
fn parse_dq(span: SpanRef) -> PResult<SpannedData<Token>> {
|
|
let (mut rem_span, _open_span) = nom::bytes::complete::tag("\"")(span)?;
|
|
let mut sb = StringBuilder::new(span);
|
|
|
|
loop {
|
|
if let Ok((rem_span, _close_span)) = nom::bytes::complete::tag("\"")(rem_span) as PResult<_> {
|
|
let (span, parts) = sb.finish(rem_span);
|
|
return Ok((rem_span, span.data(Token::String(parts))));
|
|
}
|
|
match Interpolate::parse(rem_span) {
|
|
Ok((r, interp)) => {
|
|
rem_span = r;
|
|
sb.add_interp(interp);
|
|
continue;
|
|
},
|
|
Err(nom::Err::Failure(f)) => return Err(nom::Err::Failure(f)),
|
|
Err(_) => (), // wasn't a ${ ... }, fall through
|
|
}
|
|
if let Ok((r, _)) = nom::bytes::complete::tag("\\")(rem_span) as PResult<_> {
|
|
let (r, (escaped_span, escaped)) = nom::combinator::consumed(
|
|
nom::character::complete::anychar
|
|
)(r).unrecoverable()?;
|
|
rem_span = r;
|
|
match escaped {
|
|
'n' => sb.add_escaped('\n'),
|
|
'r' => sb.add_escaped('\r'),
|
|
't' => sb.add_escaped('\t'),
|
|
'"'|'\\'|'$' => {
|
|
// must be escaped
|
|
sb.add_lit(escaped_span);
|
|
},
|
|
_ => {
|
|
// useless escape
|
|
sb.add_lit(escaped_span);
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
let (r, lit_span) = nom::bytes::complete::is_not("\"\\$")(rem_span).unrecoverable()?;
|
|
rem_span = r;
|
|
sb.add_lit(lit_span);
|
|
}
|
|
}
|
|
|
|
fn lit_remove_shared_ident(at_start: bool, prefix_len: usize, lit: &Literal) -> impl Iterator<Item=StringPart> + '_ {
|
|
use nom::Slice;
|
|
|
|
let mut offset = 0;
|
|
let lit_str = lit.as_str();
|
|
std::iter::from_fn(move || {
|
|
// if offset > 0 we set it there after we found a `\n` - i.e. always start of a line
|
|
// if offset = 0 it depends on at_start:
|
|
let at_line_start = offset != 0 || at_start;
|
|
let remaining = &lit_str[offset..];
|
|
if remaining.is_empty() { return None; }
|
|
let result: Literal;
|
|
if let Some(line_len) = remaining.find('\n') {
|
|
let abs_end = offset+line_len+1;
|
|
if at_line_start {
|
|
let line_offset = prefix_len.min(line_len); // might be an empty line without full prefix
|
|
result = Literal { span: lit.span.slice(offset+line_offset..abs_end) };
|
|
} else {
|
|
// not at line start, nothing to remove
|
|
result = Literal { span: lit.span.slice(offset..abs_end) };
|
|
}
|
|
offset = abs_end;
|
|
} else if at_line_start {
|
|
// not an "empty line" (apart from spaces), i.e. prefix must be here completely
|
|
assert!(remaining.len() >= prefix_len);
|
|
result = Literal { span: lit.span.slice(offset+prefix_len..) };
|
|
offset = lit_str.len(); // end iterator
|
|
} else {
|
|
// not at line start, nothing to remove
|
|
result = Literal { span: lit.span.slice(offset..) };
|
|
offset = lit_str.len(); // end iterator
|
|
}
|
|
Some(StringPart::Literal(result))
|
|
})
|
|
}
|
|
|
|
fn remove_shared_ident(parts: &mut Vec<StringPart>) {
|
|
use nom::Slice;
|
|
|
|
// remove trailing spaces after the last newline
|
|
if let Some(StringPart::Literal(last_lit)) = parts.last_mut() {
|
|
if let Some(last_non_space) = last_lit.rfind(|c| c != ' ') {
|
|
if last_lit.as_bytes()[last_non_space] == b'\n' {
|
|
*last_lit = Literal { span: last_lit.span.slice(..last_non_space+1) };
|
|
}
|
|
}
|
|
}
|
|
|
|
let mut at_start = true;
|
|
let mut at_line_start = true;
|
|
let mut current_max_prefix = None;
|
|
for part in parts.iter() {
|
|
if at_line_start && !at_start {
|
|
// the parser would not have splitted a literal ending in `\n` and
|
|
// the next one starting with ` `
|
|
// i.e. there shouldn't be a literal coming right now.
|
|
// -> empty prefix, nothing to remove
|
|
return;
|
|
}
|
|
at_start = false;
|
|
if let StringPart::Literal(lit) = part {
|
|
let lit_str = lit.as_str();
|
|
let mut lines = lit_str.split('\n');
|
|
if !at_line_start {
|
|
// if we weren't at the start of a line skip the
|
|
// first part before a '\n'.
|
|
// if there is no '\n' no other parts will follow,
|
|
// and at_line_start stays false.
|
|
let _ = lines.next();
|
|
}
|
|
for line in lines {
|
|
// we are now at the start of a line
|
|
// (either we were at a start before, or the first part was skipped)
|
|
|
|
// if there is nothing else than ' ' - ignore line for prefix calculation.
|
|
if let Some(prefix_len) = line.find(|c| c != ' ') {
|
|
if prefix_len == 0 {
|
|
// empty prefix, nothing to remove
|
|
return;
|
|
}
|
|
if let Some(cur_prefix_len) = current_max_prefix {
|
|
current_max_prefix = Some(prefix_len.min(cur_prefix_len));
|
|
}
|
|
}
|
|
|
|
// the next iteration will always be at the start of a line,
|
|
// but if this is the last iteration, at_line_start is true afterwards
|
|
// only if this part is empty:
|
|
at_line_start = line.is_empty();
|
|
}
|
|
} else if at_line_start {
|
|
// empty prefix, nothing to remove
|
|
return;
|
|
}
|
|
}
|
|
|
|
let prefix_len = match current_max_prefix {
|
|
None => return, // no literal parts -> no prefixes
|
|
Some(v) => v,
|
|
};
|
|
assert!(prefix_len > 0);
|
|
|
|
let mut index = 0;
|
|
let mut at_start = true;
|
|
while index < parts.len() {
|
|
if let StringPart::Literal(lit) = parts[index].clone() {
|
|
let mut clipped_parts = lit_remove_shared_ident(at_start, prefix_len, &lit);
|
|
if let Some(part) = clipped_parts.next() {
|
|
parts[index] = part;
|
|
index += 1;
|
|
for part in clipped_parts {
|
|
parts.insert(index, part);
|
|
index += 1;
|
|
}
|
|
} else {
|
|
parts.remove(index);
|
|
}
|
|
} else {
|
|
index += 1;
|
|
}
|
|
at_start = false;
|
|
}
|
|
}
|
|
|
|
fn parse_two_sq(span: SpanRef) -> PResult<SpannedData<Token>> {
|
|
use nom::Slice;
|
|
|
|
let (mut rem_span, _open_span) = nom::bytes::complete::tag("''")(span)?;
|
|
let mut sb = StringBuilder::new(span);
|
|
|
|
// skip first line if it only contains " " (or is empty)
|
|
let (r, _) = nom::combinator::opt(
|
|
nom::sequence::pair(
|
|
nom::combinator::opt(nom::bytes::complete::is_a(" ")),
|
|
nom::bytes::complete::tag("\n"),
|
|
),
|
|
)(rem_span)?;
|
|
rem_span = r;
|
|
|
|
loop {
|
|
if let Ok((r, escaped_two_sq)) = nom::bytes::complete::tag("'''")(rem_span) as PResult<SpanRef> {
|
|
// '' is escaped by a single '
|
|
rem_span = r;
|
|
sb.add_lit(escaped_two_sq.slice(1..));
|
|
continue;
|
|
}
|
|
if let Ok((r, escaped_dollar)) = nom::bytes::complete::tag("''$")(rem_span) as PResult<SpanRef> {
|
|
// $ is escaped by ''
|
|
rem_span = r;
|
|
sb.add_lit(escaped_dollar.slice(2..));
|
|
continue;
|
|
}
|
|
if let Ok((r, _escape)) = nom::bytes::complete::tag("''\\")(rem_span) as PResult<SpanRef> {
|
|
// ''\ is the generic escape for the following character
|
|
let (r, (escaped_span, escaped)) = nom::combinator::consumed(
|
|
nom::character::complete::anychar
|
|
)(r).unrecoverable()?;
|
|
rem_span = r;
|
|
match escaped {
|
|
'n' => sb.add_escaped('\n'),
|
|
'r' => sb.add_escaped('\r'),
|
|
't' => sb.add_escaped('\t'),
|
|
' ' => sb.add_escaped(' '), // not part of the indent, add as escaped part
|
|
_ => {
|
|
// useless escape - \ doesn't need an escape, $ should be ''$, ...
|
|
sb.add_lit(escaped_span);
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
if let Ok((r, two_dollar)) = nom::bytes::complete::tag("$$")(rem_span) as PResult<_> {
|
|
// $$ is passed through as literal string, $${..} not parsed as interpolation
|
|
rem_span = r;
|
|
sb.add_lit(two_dollar);
|
|
continue;
|
|
}
|
|
|
|
if let Ok((rem_span, _close_span)) = nom::bytes::complete::tag("''")(rem_span) as PResult<_> {
|
|
let (span, mut parts) = sb.finish(rem_span);
|
|
remove_shared_ident(&mut parts);
|
|
return Ok((rem_span, span.data(Token::String(parts))));
|
|
}
|
|
if let Ok((r, lit_sq)) = nom::bytes::complete::tag("'")(rem_span) as PResult<SpanRef> {
|
|
// ' - not followed by another '
|
|
rem_span = r;
|
|
sb.add_lit(lit_sq);
|
|
continue;
|
|
}
|
|
match Interpolate::parse(rem_span) {
|
|
Ok((r, interp)) => {
|
|
rem_span = r;
|
|
sb.add_interp(interp);
|
|
continue;
|
|
},
|
|
Err(nom::Err::Failure(f)) => return Err(nom::Err::Failure(f)),
|
|
Err(_) => (), // wasn't a ${ ... }, fall through
|
|
}
|
|
|
|
let (r, lit_span) = nom::bytes::complete::is_not("'$")(rem_span).unrecoverable()?;
|
|
rem_span = r;
|
|
sb.add_lit(lit_span);
|
|
}
|
|
}
|
|
|
|
fn parse_uri(span: SpanRef) -> PResult<SpannedData<Token>> {
|
|
// nix doc says: "URIs as defined in appendix B of RFC 2396";
|
|
// but the appendix only gives a regex to **split** valid URIs,
|
|
// not one to properly find them in the first place.
|
|
// it also would match relative URIs and so on - we should only accept absolute URIs.
|
|
|
|
// regex to split from appendix b: ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
|
|
// nix upstream uses: [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
|
|
|
|
let (rem_span, uri_span) = nom::combinator::recognize(nom::sequence::tuple((
|
|
// scheme
|
|
nom::character::complete::alpha1,
|
|
nom::multi::many0_count(nom::branch::alt((
|
|
nom::character::complete::alphanumeric1.map(|_| ()),
|
|
nom::character::complete::one_of("+-.").map(|_| ()),
|
|
))),
|
|
// ":"
|
|
nom::bytes::complete::tag(":"),
|
|
// [-a-zA-Z0-9%/?:@&=+$,_.!~*']+
|
|
nom::multi::many0_count(nom::branch::alt((
|
|
nom::character::complete::alphanumeric1.map(|_| ()),
|
|
nom::character::complete::one_of("-%/?:@&=+$,_.!~*'").map(|_| ()),
|
|
))),
|
|
)))(span)?;
|
|
|
|
let uri_lit = Literal::from(uri_span);
|
|
let uri = Token::String(vec![StringPart::Literal(uri_lit)]);
|
|
|
|
Ok((rem_span, uri_span.data(uri)))
|
|
}
|
|
|
|
pub(super) fn parse_string(span: SpanRef) -> PResult<SpannedData<Token>> {
|
|
nom::branch::alt((
|
|
parse_dq,
|
|
parse_two_sq,
|
|
parse_uri,
|
|
))(span)
|
|
}
|