use std::fmt; pub struct EncodedByte { storage: [u8; 4], // max: `\000` used: u8, } impl ::std::ops::Deref for EncodedByte { type Target = str; fn deref(&self) -> &Self::Target { ::unsafe_ops::from_utf8_unchecked(&self.storage[..self.used as usize]) } } pub struct EncodeIterator<'a> { encode_whitespace: bool, data: &'a [u8] } impl<'a> EncodeIterator<'a> { pub fn new_quoted(value: &'a [u8]) -> Self { EncodeIterator{ encode_whitespace: false, data: value, } } pub fn new_encode_whitespace(value: &'a [u8]) -> Self { EncodeIterator{ encode_whitespace: true, data: value, } } } impl<'a> Iterator for EncodeIterator<'a> { type Item = EncodedByte; fn next(&mut self) -> Option { if self.data.is_empty() { return None; } let b = self.data[0]; self.data = &self.data[1..]; if b < 32 || b > 127 || (self.encode_whitespace && is_ascii_whitespace(b)) { // `\ddd` let d1 = b / 100; let d2 = (b / 10) % 10; let d3 = b % 10; Some(EncodedByte{ storage: [b'\\', b'0' + d1, b'0' + d2, b'0' + d3], used: 4, }) } else if b == b'"' || b == b'\\' { // `\c` Some(EncodedByte{ storage: [b'\\', b, 0, 0], used: 2, }) } else { Some(EncodedByte{ storage: [b, 0, 0, 0], used: 1, }) } } } #[derive(Debug)] pub struct UnquoteError { data: String, position: usize, msg: &'static str, } impl fmt::Display for UnquoteError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "unquote error at position {} in {:?}: {}", self.position, self.data, self.msg) } } impl ::failure::Fail for UnquoteError {} pub struct UnquoteIterator<'a, 'b: 'a> { quoted: bool, data: &'a mut &'b str, pos: usize, } /// when walked to end without hitting errors between, the terminating /// `"` and following whitespace will be removed from `*data`. impl<'a, 'b: 'a> UnquoteIterator<'a, 'b> { pub fn new(data: &'a mut &'b str) -> Self { UnquoteIterator { quoted: false, data: data, pos: 0, } } fn err(&mut self, msg: &'static str) -> Option> { Some(Err(UnquoteError{ data: (*self.data).into(), position: self.pos, msg: msg, })) } } pub(crate) fn is_ascii_whitespace(c: u8) -> bool { match c { 0x09 => true, // horizontal tab: \t 0x0a => true, // line feed: \n 0x0c => true, // form feed: \f 0x0d => true, // form feed: \r 0x20 => true, // space: ' ' _ => false, } } impl<'a, 'b: 'a> Iterator for UnquoteIterator<'a, 'b> { type Item = Result; fn next(&mut self) -> Option { let raw = self.data.as_bytes(); if raw.is_empty() { return self.err("empty input"); } if 0 == self.pos { // check for starting quote: if raw[0] == b'"' { self.quoted = true; self.pos += 1; } } if self.pos >= raw.len() { if self.quoted { return self.err("unexpected end of string"); } else { *self.data = ""; return None; } } if raw[self.pos] == b'"' { if self.quoted { // either followed by end-of-string or a whitespace if self.pos+1 < raw.len() && !is_ascii_whitespace(raw[self.pos+1]) { return self.err("quote in the middle of quoted string"); } // eat terminating quote // pos+1 is obviously a good utf-8 boundary *self.data = self.data[self.pos+1..].trim_left(); return None; } else { return self.err("quote in the middle of unquoted string"); } } else if !self.quoted && is_ascii_whitespace(raw[self.pos]) { // pos is obviously a good utf-8 boundary *self.data = self.data[self.pos..].trim_left(); return None; } else if raw[self.pos] == b'\\' { if self.pos + 1 >= raw.len() { return self.err("unexpected end of string after backslash"); } if raw[self.pos+1] < b'0' || raw[self.pos+1] > b'9' { let result = raw[self.pos+1]; if !self.quoted && is_ascii_whitespace(result) { return self.err("(escaped) whitespace not allowed in unquoted field"); } self.pos += 2; return Some(Ok(result)); } // otherwise require 3 decimal digits if self.pos + 3 >= raw.len() { return self.err("unexpected end of string after backslash with decimal"); } // raw[self.pos+1] already checked for digit above if raw[self.pos+2] < b'0' || raw[self.pos+2] > b'9' || raw[self.pos+3] < b'0' || raw[self.pos+3] > b'9' { return self.err("expecting 3 digits after backslash with decimal"); } let d1 = raw[self.pos+1] - b'0'; let d2 = raw[self.pos+2] - b'0'; let d3 = raw[self.pos+3] - b'0'; let val = (d1 as u32 * 100) + (d2 as u32 * 10) + (d3 as u32); if val > 255 { return self.err("invalid decimal escape"); } self.pos += 4; Some(Ok(val as u8)) } else { let result = raw[self.pos]; self.pos += 1; Some(Ok(result)) } } } #[cfg(test)] mod tests { use ser::text::{next_quoted_field, quote}; fn check_quote(data: &[u8], quoted: &str) { assert_eq!( quote(data), quoted ); } fn check_unquote(mut input: &str, data: &[u8]) { assert_eq!( next_quoted_field(&mut input).unwrap(), data ); assert!(input.is_empty()); } #[test] fn test_escapes() { check_quote(b"\"hello \\ \xc3\xa4", r#""\"hello \\ \195\164""#); } #[test] fn test_parser() { check_unquote(r#""\"hello \\ \195\164""#, b"\"hello \\ \xc3\xa4"); check_unquote(r#" "\"hello \\ \195\164" "#, b"\"hello \\ \xc3\xa4"); check_unquote(r#""\"hello \\ ä""#, b"\"hello \\ \xc3\xa4"); check_unquote(r#" "\"hello \\ ä" "#, b"\"hello \\ \xc3\xa4"); // unquoted input check_unquote(r#"foobarä"#, b"foobar\xc3\xa4"); check_unquote(r#"foobar\195\164"#, b"foobar\xc3\xa4"); check_unquote(r#" foobarä "#, b"foobar\xc3\xa4"); // random (unnecessary) escapes: check_unquote(r#" "\x\%\@\." "#, b"x%@."); } }