242 lines
5.7 KiB
Rust
242 lines
5.7 KiB
Rust
use std::fmt;
|
|
|
|
/// Textual encoding of a single input byte, stored inline without allocation.
///
/// Only the first `used` bytes of `storage` carry data.
pub struct EncodedByte {
    /// Inline buffer; four bytes fit the longest form, a backslash plus three digits.
    storage: [u8; 4],
    /// Number of leading bytes of `storage` that are occupied.
    used: u8,
}
|
|
|
|
impl ::std::ops::Deref for EncodedByte {
|
|
type Target = str;
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
crate::unsafe_ops::from_utf8_unchecked(&self.storage[..self.used as usize])
|
|
}
|
|
}
|
|
|
|
/// Iterator state for escaping a byte slice one byte at a time.
pub struct EncodeIterator<'a> {
    /// When set, ASCII whitespace bytes are escaped in addition to the always-escaped bytes.
    encode_whitespace: bool,
    /// Input bytes that have not been encoded yet.
    data: &'a [u8],
}
|
|
|
|
impl<'a> EncodeIterator<'a> {
|
|
pub fn new_quoted(value: &'a [u8]) -> Self {
|
|
EncodeIterator {
|
|
encode_whitespace: false,
|
|
data: value,
|
|
}
|
|
}
|
|
|
|
pub fn new_encode_whitespace(value: &'a [u8]) -> Self {
|
|
EncodeIterator {
|
|
encode_whitespace: true,
|
|
data: value,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator for EncodeIterator<'a> {
|
|
type Item = EncodedByte;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
if self.data.is_empty() {
|
|
return None;
|
|
}
|
|
let b = self.data[0];
|
|
self.data = &self.data[1..];
|
|
if b < 32 || b >= 127 || (self.encode_whitespace && is_ascii_whitespace(b)) {
|
|
// `\ddd`
|
|
let d1 = b / 100;
|
|
let d2 = (b / 10) % 10;
|
|
let d3 = b % 10;
|
|
Some(EncodedByte {
|
|
storage: [b'\\', b'0' + d1, b'0' + d2, b'0' + d3],
|
|
used: 4,
|
|
})
|
|
} else if b == b'"' || b == b'\\' {
|
|
// `\c`
|
|
Some(EncodedByte {
|
|
storage: [b'\\', b, 0, 0],
|
|
used: 2,
|
|
})
|
|
} else {
|
|
Some(EncodedByte {
|
|
storage: [b, 0, 0, 0],
|
|
used: 1,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Describes why and where decoding a quoted/escaped field failed.
#[derive(Debug)]
pub struct UnquoteError {
    /// Owned copy of the input that was being decoded when the error was raised.
    data: String,
    /// Byte offset into `data` at which the problem was detected.
    position: usize,
    /// Fixed description of the problem.
    msg: &'static str,
}
|
|
|
|
impl fmt::Display for UnquoteError {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
write!(
|
|
f,
|
|
"unquote error at position {} in {:?}: {}",
|
|
self.position, self.data, self.msg
|
|
)
|
|
}
|
|
}
|
|
|
|
// Empty impl: `UnquoteError` implements `failure::Fail` without overriding any trait items.
impl failure::Fail for UnquoteError {}
|
|
|
|
/// Iterator state for decoding one, optionally double-quoted, field from the front of a string.
pub struct UnquoteIterator<'a, 'b: 'a> {
    /// Set when the input started with an opening `"`.
    quoted: bool,
    /// The caller's string slice; overwritten with the unconsumed remainder when the field ends.
    data: &'a mut &'b str,
    /// Byte offset of the next unread byte within `*data`.
    pos: usize,
}
|
|
|
|
/// when walked to end without hitting errors between, the terminating
|
|
/// `"` and following whitespace will be removed from `*data`.
|
|
impl<'a, 'b: 'a> UnquoteIterator<'a, 'b> {
|
|
pub fn new(data: &'a mut &'b str) -> Self {
|
|
UnquoteIterator {
|
|
quoted: false,
|
|
data: data,
|
|
pos: 0,
|
|
}
|
|
}
|
|
|
|
fn err<T>(&mut self, msg: &'static str) -> Option<Result<T, UnquoteError>> {
|
|
Some(Err(UnquoteError {
|
|
data: (*self.data).into(),
|
|
position: self.pos,
|
|
msg: msg,
|
|
}))
|
|
}
|
|
}
|
|
|
|
/// Returns `true` if `c` is one of the five ASCII whitespace bytes:
///
/// * `0x09` horizontal tab (`\t`)
/// * `0x0a` line feed (`\n`)
/// * `0x0c` form feed (`\f`)
/// * `0x0d` carriage return (`\r`)
/// * `0x20` space
///
/// Vertical tab (`0x0b`) is not treated as whitespace. This is the same set the standard
/// library's `u8::is_ascii_whitespace` recognises, so the check is delegated to it.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    // Resolves to the inherent `u8` method, not to this free function.
    c.is_ascii_whitespace()
}
|
|
|
|
impl<'a, 'b: 'a> Iterator for UnquoteIterator<'a, 'b> {
|
|
type Item = Result<u8, UnquoteError>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
let raw = self.data.as_bytes();
|
|
|
|
if raw.is_empty() {
|
|
return self.err("empty input");
|
|
}
|
|
|
|
if 0 == self.pos {
|
|
// check for starting quote:
|
|
if raw[0] == b'"' {
|
|
self.quoted = true;
|
|
self.pos += 1;
|
|
}
|
|
}
|
|
|
|
if self.pos >= raw.len() {
|
|
if self.quoted {
|
|
return self.err("unexpected end of string");
|
|
} else {
|
|
*self.data = "";
|
|
return None;
|
|
}
|
|
}
|
|
if raw[self.pos] == b'"' {
|
|
if self.quoted {
|
|
// either followed by end-of-string or a whitespace
|
|
if self.pos + 1 < raw.len() && !is_ascii_whitespace(raw[self.pos + 1]) {
|
|
return self.err("quote in the middle of quoted string");
|
|
}
|
|
// eat terminating quote
|
|
// pos+1 is obviously a good utf-8 boundary
|
|
*self.data = self.data[self.pos + 1..].trim_start();
|
|
return None;
|
|
} else {
|
|
return self.err("quote in the middle of unquoted string");
|
|
}
|
|
} else if !self.quoted && is_ascii_whitespace(raw[self.pos]) {
|
|
// pos is obviously a good utf-8 boundary
|
|
*self.data = self.data[self.pos..].trim_start();
|
|
return None;
|
|
} else if raw[self.pos] == b'\\' {
|
|
if self.pos + 1 >= raw.len() {
|
|
return self.err("unexpected end of string after backslash");
|
|
}
|
|
if raw[self.pos + 1] < b'0' || raw[self.pos + 1] > b'9' {
|
|
let result = raw[self.pos + 1];
|
|
if !self.quoted && is_ascii_whitespace(result) {
|
|
return self.err("(escaped) whitespace not allowed in unquoted field");
|
|
}
|
|
self.pos += 2;
|
|
return Some(Ok(result));
|
|
}
|
|
// otherwise require 3 decimal digits
|
|
if self.pos + 3 >= raw.len() {
|
|
return self.err("unexpected end of string after backslash with decimal");
|
|
}
|
|
// raw[self.pos+1] already checked for digit above
|
|
if raw[self.pos + 2] < b'0'
|
|
|| raw[self.pos + 2] > b'9'
|
|
|| raw[self.pos + 3] < b'0'
|
|
|| raw[self.pos + 3] > b'9'
|
|
{
|
|
return self.err("expecting 3 digits after backslash with decimal");
|
|
}
|
|
let d1 = raw[self.pos + 1] - b'0';
|
|
let d2 = raw[self.pos + 2] - b'0';
|
|
let d3 = raw[self.pos + 3] - b'0';
|
|
let val = (d1 as u32 * 100) + (d2 as u32 * 10) + (d3 as u32);
|
|
if val > 255 {
|
|
return self.err("invalid decimal escape");
|
|
}
|
|
self.pos += 4;
|
|
Some(Ok(val as u8))
|
|
} else {
|
|
let result = raw[self.pos];
|
|
self.pos += 1;
|
|
Some(Ok(result))
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::ser::text::{next_quoted_field, quote};

    /// Asserts that quoting `data` yields exactly `quoted`.
    fn check_quote(data: &[u8], quoted: &str) {
        let encoded = quote(data);
        assert_eq!(encoded, quoted);
    }

    /// Asserts that the first field of `input` decodes to `data` and that nothing is left over.
    fn check_unquote(mut input: &str, data: &[u8]) {
        let decoded = next_quoted_field(&mut input).unwrap();
        assert_eq!(decoded, data);
        assert!(input.is_empty());
    }

    #[test]
    fn test_escapes() {
        check_quote(b"\"hello \\ \xc3\xa4", r#""\"hello \\ \195\164""#);
    }

    #[test]
    fn test_parser() {
        let cases: [(&str, &[u8]); 8] = [
            (r#""\"hello \\ \195\164""#, b"\"hello \\ \xc3\xa4"),
            (r#" "\"hello \\ \195\164" "#, b"\"hello \\ \xc3\xa4"),
            (r#""\"hello \\ ä""#, b"\"hello \\ \xc3\xa4"),
            (r#" "\"hello \\ ä" "#, b"\"hello \\ \xc3\xa4"),
            // unquoted input
            (r#"foobarä"#, b"foobar\xc3\xa4"),
            (r#"foobar\195\164"#, b"foobar\xc3\xa4"),
            (r#" foobarä "#, b"foobar\xc3\xa4"),
            // random (unnecessary) escapes:
            (r#" "\x\%\@\." "#, b"x%@."),
        ];

        for &(input, expected) in cases.iter() {
            check_unquote(input, expected);
        }
    }
}
|