From 0aaf9e1b533c0d4f1e7c257d7d7d1267c26fcf6b Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Thu, 18 Jun 2026 11:20:02 -0700 Subject: [PATCH] Recover JSON-RPC messages with malformed unicode escapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Rust SDK's JSON-RPC read loop parsed each framed body with a strict serde_json::from_slice. A response containing a malformed `\u` escape — notably a lone (unpaired) UTF-16 surrogate — failed with "unexpected end of hex escape", which aborted JsonRpcClient::read_loop, drained all pending requests, and tore down the transport. The channel then reconnected and re-broke on the same deterministic bytes, so every RPC on it (list_models, get_account_quota, list_global_agents, list_global_skills) failed together (github/app#1055). Add a string-context-aware sanitizer (surrogate_safe::sanitize_json_escapes) that rewrites lone surrogates and otherwise-malformed `\u` escapes to U+FFFD while preserving valid surrogate pairs and escaped backslashes. It runs only on the parse-error path, so well-formed traffic is unaffected and zero-allocation (Cow::Borrowed) when nothing needs repair. On recovery the reader warns with the replacement count and re-parses; unrelated syntax errors still propagate unchanged. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- rust/src/jsonrpc.rs | 25 ++- rust/src/lib.rs | 3 + rust/src/surrogate_safe.rs | 305 +++++++++++++++++++++++++++++++++++++ rust/tests/jsonrpc_test.rs | 50 ++++++ 4 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 rust/src/surrogate_safe.rs diff --git a/rust/src/jsonrpc.rs b/rust/src/jsonrpc.rs index fbdc96505..288d1b9a0 100644 --- a/rust/src/jsonrpc.rs +++ b/rust/src/jsonrpc.rs @@ -428,7 +428,30 @@ impl JsonRpcClient { let mut body = vec![0u8; length]; reader.read_exact(&mut body).await?; - let message: JsonRpcMessage = serde_json::from_slice(&body)?; + let message: JsonRpcMessage = match serde_json::from_slice(&body) { + Ok(message) => message, + Err(parse_error) => { + // A strict parse failure is often a lone UTF-16 surrogate + // (or other malformed `\u` escape) emitted by a lenient + // upstream serializer. Rather than killing the read loop — + // which tears down the transport and retries forever on the + // same bytes (github/app#1055) — rewrite the bad escapes to + // U+FFFD and try once more. The sanitizer is a no-op for any + // other syntax error, so unrelated failures still propagate. + match crate::surrogate_safe::sanitize_json_escapes(&body) { + (std::borrow::Cow::Owned(sanitized), replacements) => { + warn!( + replacements, + original_error = %parse_error, + "recovered JSON-RPC message containing malformed unicode escapes by \ + substituting U+FFFD" + ); + serde_json::from_slice(&sanitized)? + } + (std::borrow::Cow::Borrowed(_), _) => return Err(parse_error.into()), + } + } + }; Ok(Some(message)) } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index c04fe19e1..040311637 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -41,6 +41,9 @@ mod wire; /// Session event payload types — auto-generated from the protocol schema. pub mod session_events; +/// Recovery for JSON-RPC payloads carrying malformed `\u` escapes. +mod surrogate_safe; + /// JSON-RPC request/response types and typed namespace builders for /// [`Client::rpc`] and [`session::Session::rpc`](crate::session::Session::rpc). pub mod rpc; diff --git a/rust/src/surrogate_safe.rs b/rust/src/surrogate_safe.rs new file mode 100644 index 000000000..91176c9b7 --- /dev/null +++ b/rust/src/surrogate_safe.rs @@ -0,0 +1,305 @@ +//! Recovery for JSON payloads that carry malformed `\u` escapes. +//! +//! `serde_json` is strict per RFC 8259: a lone UTF-16 surrogate (or any +//! `\u` escape that is not followed by four hex digits) is a hard parse +//! error. When such a payload arrives on the JSON-RPC stream — for +//! example a backend response that was serialized by a lenient producer, +//! or a session-resume frame truncated mid-emoji — the strict parse +//! aborts the whole read loop and tears down the transport, which then +//! retries forever on the same bad bytes (see github/app#1055). +//! +//! [`sanitize_json_escapes`] rewrites the offending escapes to the +//! Unicode replacement character (`\ufffd`) so the message can be parsed +//! and delivered instead of killing the connection. It is invoked only on +//! the parse-error path, so well-formed traffic pays nothing. + +use std::borrow::Cow; + +/// The JSON escape for U+FFFD REPLACEMENT CHARACTER, substituted for any +/// malformed or unpaired `\u` escape. +const REPLACEMENT_ESCAPE: &[u8] = b"\\ufffd"; + +#[inline] +fn hex_value(byte: u8) -> u16 { + match byte { + b'0'..=b'9' => u16::from(byte - b'0'), + b'a'..=b'f' => u16::from(byte - b'a' + 10), + b'A'..=b'F' => u16::from(byte - b'A' + 10), + _ => unreachable!("caller guarantees an ASCII hex digit"), + } +} + +/// Decode a `\uXXXX` escape that begins at `input[i]` (which the caller +/// guarantees is `\\` immediately followed by `u`). Returns the 16-bit +/// code unit when four hex digits follow, otherwise `None`. +fn decode_unicode_escape(input: &[u8], i: usize) -> Option { + let digits = input.get(i + 2..i + 6)?; + if digits.iter().all(u8::is_ascii_hexdigit) { + Some( + (hex_value(digits[0]) << 12) + | (hex_value(digits[1]) << 8) + | (hex_value(digits[2]) << 4) + | hex_value(digits[3]), + ) + } else { + None + } +} + +#[inline] +fn is_high_surrogate(unit: u16) -> bool { + (0xD800..=0xDBFF).contains(&unit) +} + +#[inline] +fn is_low_surrogate(unit: u16) -> bool { + (0xDC00..=0xDFFF).contains(&unit) +} + +/// Rewrite lone UTF-16 surrogates and otherwise-malformed `\u` escapes in +/// a UTF-8 JSON document to `\ufffd`, leaving every other byte untouched. +/// +/// Returns [`Cow::Borrowed`] (and a count of `0`) when the input is +/// already strict-JSON safe, so the common case allocates nothing. The +/// returned count is the number of escapes that were replaced. +/// +/// The scan tracks JSON string context so an escaped backslash (`\\u…`, +/// where the `u` is literal text, not an escape) is never misinterpreted +/// as a unicode escape. Bytes outside strings — structure, numbers, raw +/// multi-byte UTF-8 — pass through verbatim. +pub(crate) fn sanitize_json_escapes(input: &[u8]) -> (Cow<'_, [u8]>, usize) { + // Lone-surrogate and malformed escapes only exist behind a backslash; + // without one there is nothing to repair. + if !input.contains(&b'\\') { + return (Cow::Borrowed(input), 0); + } + + let mut out: Option> = None; + let mut replacements = 0usize; + let mut in_string = false; + let mut i = 0; + let len = input.len(); + + // Allocate the owned buffer lazily, seeded with the verbatim prefix we + // had been borrowing, the first time a byte range diverges from the + // input. + macro_rules! diverge { + () => { + out.get_or_insert_with(|| input[..i].to_vec()) + }; + } + macro_rules! copy { + ($range:expr) => { + if let Some(buf) = out.as_mut() { + buf.extend_from_slice($range); + } + }; + } + + while i < len { + let byte = input[i]; + + if !in_string { + if byte == b'"' { + in_string = true; + } + copy!(&input[i..=i]); + i += 1; + continue; + } + + match byte { + b'"' => { + in_string = false; + copy!(&input[i..=i]); + i += 1; + } + b'\\' => match input.get(i + 1) { + // A `\u…` escape: inspect it for surrogate validity. + Some(b'u') => match decode_unicode_escape(input, i) { + Some(unit) if is_high_surrogate(unit) => { + let low_start = i + 6; + let paired = input.get(low_start) == Some(&b'\\') + && input.get(low_start + 1) == Some(&b'u') + && decode_unicode_escape(input, low_start) + .is_some_and(is_low_surrogate); + if paired { + copy!(&input[i..i + 12]); + i += 12; + } else { + diverge!().extend_from_slice(REPLACEMENT_ESCAPE); + replacements += 1; + i += 6; + } + } + Some(unit) if is_low_surrogate(unit) => { + diverge!().extend_from_slice(REPLACEMENT_ESCAPE); + replacements += 1; + i += 6; + } + Some(_) => { + copy!(&input[i..i + 6]); + i += 6; + } + None => { + // `\u` not followed by four hex digits. Emit one + // replacement and consume the backslash, the `u`, + // and any partial hex run so the leftover digits + // don't read as a second broken escape. + diverge!().extend_from_slice(REPLACEMENT_ESCAPE); + replacements += 1; + let mut j = i + 2; + while j < len && j < i + 6 && input[j].is_ascii_hexdigit() { + j += 1; + } + i = j; + } + }, + // Any other escape (`\"`, `\\`, `\n`, …) is two bytes that + // pass through untouched; consuming both also prevents an + // escaped backslash from masking the following character. + Some(_) => { + copy!(&input[i..i + 2]); + i += 2; + } + // Trailing lone backslash at end of buffer. + None => { + copy!(&input[i..=i]); + i += 1; + } + }, + _ => { + copy!(&input[i..=i]); + i += 1; + } + } + } + + match out { + Some(buf) => (Cow::Owned(buf), replacements), + None => (Cow::Borrowed(input), replacements), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sanitized_str(input: &str) -> (String, usize) { + let (bytes, count) = sanitize_json_escapes(input.as_bytes()); + (String::from_utf8(bytes.into_owned()).unwrap(), count) + } + + #[test] + fn clean_payload_is_borrowed_without_allocation() { + let input = br#"{"a":"hello \u00e9 \ud83d\ude00 world","b":[1,2,3]}"#; + let (out, count) = sanitize_json_escapes(input); + assert!(matches!(out, Cow::Borrowed(_))); + assert_eq!(count, 0); + assert_eq!(out.as_ref(), input); + } + + #[test] + fn payload_without_backslash_is_borrowed() { + let input = br#"{"a":"plain text","n":42}"#; + let (out, count) = sanitize_json_escapes(input); + assert!(matches!(out, Cow::Borrowed(_))); + assert_eq!(count, 0); + } + + #[test] + fn lone_high_surrogate_becomes_replacement() { + let (out, count) = sanitized_str(r#"{"a":"x\ud83d y"}"#); + assert_eq!(out, r#"{"a":"x\ufffd y"}"#); + assert_eq!(count, 1); + // The repaired text must now parse strictly. + let value: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert_eq!(value["a"], "x\u{fffd} y"); + } + + #[test] + fn lone_low_surrogate_becomes_replacement() { + let (out, count) = sanitized_str(r#"{"a":"x\udd11 y"}"#); + assert_eq!(out, r#"{"a":"x\ufffd y"}"#); + assert_eq!(count, 1); + serde_json::from_str::(&out).unwrap(); + } + + #[test] + fn valid_surrogate_pair_is_preserved() { + let (out, count) = sanitized_str(r#"{"a":"key \ud83d\udd11 done"}"#); + assert_eq!(out, r#"{"a":"key \ud83d\udd11 done"}"#); + assert_eq!(count, 0); + let value: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert_eq!(value["a"], "key \u{1f511} done"); + } + + #[test] + fn high_surrogate_followed_by_bmp_escape_is_repaired() { + // `\ud83d` (high) followed by a non-low escape `\u0041` ('A'). + let (out, count) = sanitized_str(r#"{"a":"\ud83d\u0041"}"#); + assert_eq!(out, r#"{"a":"\ufffd\u0041"}"#); + assert_eq!(count, 1); + let value: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert_eq!(value["a"], "\u{fffd}A"); + } + + #[test] + fn high_surrogate_at_end_of_string_is_repaired() { + let (out, count) = sanitized_str(r#"{"a":"tail\ud83d"}"#); + assert_eq!(out, r#"{"a":"tail\ufffd"}"#); + assert_eq!(count, 1); + serde_json::from_str::(&out).unwrap(); + } + + #[test] + fn malformed_hex_escape_is_repaired() { + let (out, count) = sanitized_str(r#"{"a":"x\uZZZZ y"}"#); + assert_eq!(out, r#"{"a":"x\ufffdZZZZ y"}"#); + assert_eq!(count, 1); + serde_json::from_str::(&out).unwrap(); + } + + #[test] + fn truncated_hex_escape_before_quote_is_repaired() { + let (out, count) = sanitized_str(r#"{"a":"x\ud8"}"#); + assert_eq!(out, r#"{"a":"x\ufffd"}"#); + assert_eq!(count, 1); + serde_json::from_str::(&out).unwrap(); + } + + #[test] + fn escaped_backslash_before_u_is_not_an_escape() { + // `\\u d83d` is a literal backslash followed by the text "ud83d"; + // it must be left alone, not treated as a surrogate. + let (out, count) = sanitized_str(r#"{"a":"\\ud83d"}"#); + assert_eq!(out, r#"{"a":"\\ud83d"}"#); + assert_eq!(count, 0); + } + + #[test] + fn surrogate_like_bytes_outside_strings_are_untouched() { + // A key that itself contains a valid escape plus a lone surrogate + // in the value, to confirm string boundaries are tracked. + let (out, count) = sanitized_str(r#"{"k\ud83d":"v"}"#); + assert_eq!(out, r#"{"k\ufffd":"v"}"#); + assert_eq!(count, 1); + serde_json::from_str::(&out).unwrap(); + } + + #[test] + fn multiple_lone_surrogates_are_all_replaced() { + let (out, count) = sanitized_str(r#"{"a":"\ud83d\ud83d","b":"\udc00"}"#); + assert_eq!(out, r#"{"a":"\ufffd\ufffd","b":"\ufffd"}"#); + assert_eq!(count, 3); + serde_json::from_str::(&out).unwrap(); + } + + #[test] + fn raw_multibyte_utf8_in_string_is_preserved() { + let (out, count) = sanitized_str(r#"{"a":"café 🚀 \ud83d"}"#); + assert_eq!(out, r#"{"a":"café 🚀 \ufffd"}"#); + assert_eq!(count, 1); + serde_json::from_str::(&out).unwrap(); + } +} diff --git a/rust/tests/jsonrpc_test.rs b/rust/tests/jsonrpc_test.rs index 7f7d43213..3ed78f018 100644 --- a/rust/tests/jsonrpc_test.rs +++ b/rust/tests/jsonrpc_test.rs @@ -212,6 +212,56 @@ async fn read_loop_terminates_on_eof() { tokio::time::sleep(std::time::Duration::from_millis(100)).await; } +/// Regression for github/app#1055: a JSON-RPC message carrying a lone +/// UTF-16 surrogate (or other malformed `\u` escape) must not kill the +/// read loop. The bad escape is rewritten to U+FFFD, the message is +/// delivered, and subsequent messages on the same channel still arrive. +#[tokio::test] +async fn malformed_unicode_escape_is_recovered_and_loop_survives() { + let (_client_write, _discard) = duplex(4096); + let (mut server_write, client_read) = duplex(4096); + + let (notification_tx, mut notification_rx) = broadcast::channel(16); + let (request_tx, _request_rx) = mpsc::unbounded_channel(); + + let _client = JsonRpcClient::new(_client_write, client_read, notification_tx, request_tx); + + // A lone high surrogate `\ud83d` in the params — strict serde_json + // rejects this with "unexpected end of hex escape". Construct the raw + // body by hand since serde would never emit a lone surrogate. + let bad_body = + br#"{"jsonrpc":"2.0","method":"session.event","params":{"text":"oops \ud83d here"}}"#; + write_framed(&mut server_write, bad_body).await; + + let recovered: JsonRpcNotification = + tokio::time::timeout(std::time::Duration::from_secs(2), notification_rx.recv()) + .await + .expect("timed out waiting for recovered notification") + .unwrap(); + assert_eq!(recovered.method, "session.event"); + assert_eq!( + recovered.params.unwrap()["text"], + "oops \u{fffd} here", + "lone surrogate should be replaced with U+FFFD" + ); + + // The channel must still be alive: a following well-formed message + // arrives normally rather than the loop having torn down the transport. + let next = serde_json::json!({ + "jsonrpc": "2.0", + "method": "session.event", + "params": { "text": "still here" } + }); + write_framed(&mut server_write, &serde_json::to_vec(&next).unwrap()).await; + + let received: JsonRpcNotification = + tokio::time::timeout(std::time::Duration::from_secs(2), notification_rx.recv()) + .await + .expect("timed out waiting for follow-up notification") + .unwrap(); + assert_eq!(received.params.unwrap()["text"], "still here"); +} + /// Cancel-safety regression: dropping a `write()` future after the actor has /// committed to writing must NOT produce a partial frame on the wire. ///