feat: non-strict mode replaces invalid UTF-8 (#62)

This change makes non-strict mode percent-decode UTF-8 lossily: invalid percent-encoded UTF-8 sequences, such as `%E9`, are now decoded as the Unicode replacement character (� `U+FFFD`) instead of causing an error. Refs: https://github.com/samscott89/serde_qs/issues/43
author: Jeremiah Senkpiel <fishrock123@rocketmail.com> 2022-07-09 18:02:44 -0700
committer: GitHub <noreply@github.com> 2022-07-09 20:02:44 -0500
commit: f26bdc9109abffc86ea78bb85cf3504a771d17ad (patch)
tree: dbd543924da72c713578f8cb28e6facebcf04e5d
parent: 3ddcd171ea66efa94821533299f48788a18fb33a (diff)
3 files changed, 38 insertions, 4 deletions
diff --git a/src/de/parse.rs b/src/de/parse.rs
index 5c85b12..fe14bb4 100644
--- a/src/de/parse.rs
+++ b/src/de/parse.rs
@@ -242,8 +242,16 @@ impl<'a> Parser<'a> {
     /// present.
     fn collect_str(&mut self) -> Result<Cow<'a, str>> {
         let replaced = replace_plus(&self.inner[self.acc.0..self.acc.1 - 1]);
+        let decoder = percent_encoding::percent_decode(&replaced);
+
+        let maybe_decoded = if self.strict {
+            decoder.decode_utf8()?
+        } else {
+            decoder.decode_utf8_lossy()
+        };
+
         let ret: Result<Cow<'a, str>> =
-            match percent_encoding::percent_decode(&replaced).decode_utf8()? {
+            match maybe_decoded {
                 Cow::Borrowed(_) => {
                     match replaced {
                         Cow::Borrowed(_) => {
diff --git a/src/lib.rs b/src/lib.rs
index 96ce141..9df0f15 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -75,9 +75,14 @@
 //! ## Strict vs Non-Strict modes
 //!
 //! `serde_qs` supports two operating modes, which can be specified using
-//! [`Config`](struct.Config.html), and is all about how `serde_qs` handles square brackets.
-//!
-//! Techncially, square brackets should be encoded in URLs as `%5B` and `%5D`.
+//! [`Config`](struct.Config.html).
+//! Strict mode has two parts:
+//! - how `serde_qs` handles square brackets
+//! - how `serde_qs` handles percent-encoded data that is not valid UTF-8
+//!
+//! ### Square Brackets
+//! 
+//! Technically, square brackets should be encoded in URLs as `%5B` and `%5D`.
 //! However, they are often used in their raw format to specify querystrings
 //! such as `a[b]=123`.
 //!
@@ -96,6 +101,14 @@
 //! automatically encodes the brackets. But care must be taken to avoid
 //! using keys with square brackets in them, or unexpected things can
 //! happen.
+//! 
+//! ### Invalid UTF-8 Percent Encodings
+//! 
+//! Sometimes querystrings contain percent-encoded data which does not decode
+//! to valid UTF-8. In some cases it is useful for this to cause an error, which
+//! is how `serde_qs` works in strict mode (the default). In other cases it
+//! is more useful to replace such data with the Unicode replacement
+//! character (� `U+FFFD`), which is how `serde_qs` works in non-strict mode.
 //!
 //! ## Flatten workaround
 //!
diff --git a/tests/test_deserialize.rs b/tests/test_deserialize.rs
index 40cd43b..92e5f53 100644
--- a/tests/test_deserialize.rs
+++ b/tests/test_deserialize.rs
@@ -560,6 +560,19 @@ fn strict_mode() {
         .deserialize_str("vec%5B%5D=1&vec%5B%5D=2")
         .unwrap();
     assert_eq!(params.vec, vec![1, 2]);
+
+    #[derive(Debug, Serialize, Deserialize, PartialEq)]
+    struct StringQueryParam {
+        field: String,
+    }
+
+    // Ensure strict mode produces an error for invalid UTF-8 percent encoded characters.
+    let invalid_utf8: Result<StringQueryParam, _> = strict_config.deserialize_str("field=%E9");
+    assert!(invalid_utf8.is_err());
+
+    // Ensure that in loose mode, invalid UTF-8 percent-encoded sequences become � (U+FFFD).
+    let valid_utf8: StringQueryParam = loose_config.deserialize_str("field=%E9").unwrap();
+    assert_eq!(valid_utf8.field, "�");
 }
 
 #[test]
author	Jeremiah Senkpiel <fishrock123@rocketmail.com>	2022-07-09 18:02:44 -0700
committer	GitHub <noreply@github.com>	2022-07-09 20:02:44 -0500
commit	f26bdc9109abffc86ea78bb85cf3504a771d17ad (patch)
tree	dbd543924da72c713578f8cb28e6facebcf04e5d
parent	3ddcd171ea66efa94821533299f48788a18fb33a (diff)