diff options
author | Jeremiah Senkpiel <fishrock123@rocketmail.com> | 2022-07-09 18:02:44 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-09 20:02:44 -0500 |
commit | f26bdc9109abffc86ea78bb85cf3504a771d17ad (patch) | |
tree | dbd543924da72c713578f8cb28e6facebcf04e5d | |
parent | 3ddcd171ea66efa94821533299f48788a18fb33a (diff) |
feat: non-strict mode replaces invalid UTF-8 (#62)
This change makes non-strict mode percent-decode UTF-8 lossily.
This means that invalid UTF-8 percent-encoded sequences, such as `%E9`, are
decoded as the Unicode replacement character (� `U+FFFD`).
Refs: https://github.com/samscott89/serde_qs/issues/43
-rw-r--r-- | src/de/parse.rs | 10 | ||||
-rw-r--r-- | src/lib.rs | 19 | ||||
-rw-r--r-- | tests/test_deserialize.rs | 13 |
3 files changed, 38 insertions, 4 deletions
diff --git a/src/de/parse.rs b/src/de/parse.rs index 5c85b12..fe14bb4 100644 --- a/src/de/parse.rs +++ b/src/de/parse.rs @@ -242,8 +242,16 @@ impl<'a> Parser<'a> { /// present. fn collect_str(&mut self) -> Result<Cow<'a, str>> { let replaced = replace_plus(&self.inner[self.acc.0..self.acc.1 - 1]); + let decoder = percent_encoding::percent_decode(&replaced); + + let maybe_decoded = if self.strict { + decoder.decode_utf8()? + } else { + decoder.decode_utf8_lossy() + }; + let ret: Result<Cow<'a, str>> = - match percent_encoding::percent_decode(&replaced).decode_utf8()? { + match maybe_decoded { Cow::Borrowed(_) => { match replaced { Cow::Borrowed(_) => { @@ -75,9 +75,14 @@ //! ## Strict vs Non-Strict modes //! //! `serde_qs` supports two operating modes, which can be specified using -//! [`Config`](struct.Config.html), and is all about how `serde_qs` handles square brackets. -//! -//! Techncially, square brackets should be encoded in URLs as `%5B` and `%5D`. +//! [`Config`](struct.Config.html). +//! Strict mode has two parts: +//! - how `serde_qs` handles square brackets +//! - how `serde_qs` handles invalid UTF-8 percent decoded characters +//! +//! ### Square Brackets +//! +//! Technically, square brackets should be encoded in URLs as `%5B` and `%5D`. //! However, they are often used in their raw format to specify querystrings //! such as `a[b]=123`. //! @@ -96,6 +101,14 @@ //! automatically encodes the brackets. But care must be taken to avoid //! using keys with square brackets in them, or unexpected things can //! happen. +//! +//! ### Invalid UTF-8 Percent Encodings +//! +//! Sometimes querystrings may have percent-encoded data which does not decode +//! to UTF-8. In some cases it is useful for this to cause errors, which is how +//! `serde_qs` works in strict mode (the default). Whereas in other cases it +//! can be useful to just replace such data with the unicode replacement +//! character (� `U+FFFD`), which is how `serde_qs` works in non-strict mode. //! 
//! ## Flatten workaround //! diff --git a/tests/test_deserialize.rs b/tests/test_deserialize.rs index 40cd43b..92e5f53 100644 --- a/tests/test_deserialize.rs +++ b/tests/test_deserialize.rs @@ -560,6 +560,19 @@ fn strict_mode() { .deserialize_str("vec%5B%5D=1&vec%5B%5D=2") .unwrap(); assert_eq!(params.vec, vec![1, 2]); + + #[derive(Debug, Serialize, Deserialize, PartialEq)] + struct StringQueryParam { + field: String, + } + + // Ensure strict mode produces an error for invalid UTF-8 percent encoded characters. + let invalid_utf8: Result<StringQueryParam, _> = strict_config.deserialize_str("field=%E9"); + assert!(invalid_utf8.is_err()); + + // Ensure loose mode invalid UTF-8 percent encoded characters become � U+FFFD. + let valid_utf8: StringQueryParam = loose_config.deserialize_str("field=%E9").unwrap(); + assert_eq!(valid_utf8.field, "�"); } #[test] |