From b9ed021a423fb4ae919ad62a840a8bd4ea46b98c Mon Sep 17 00:00:00 2001
From: Humphrey Sun <humphreysun98@gmail.com>
Date: Thu, 4 Jun 2026 22:56:14 -0500
Subject: [PATCH] fix(community): preserve non-ASCII in BigQuery callback JSON
 serialization

`BigQueryCallbackHandler` (and the langgraph/async variants) build the
content for BigQuery JSON columns with bare `json.dumps(...)`. Python's
default `ensure_ascii=True` escapes every non-ASCII character to
`\uXXXX`, so CJK / emoji / accented text from chain inputs, outputs,
documents, tool calls, agent actions, and langgraph attributes land in
storage as escape sequences and are unreadable when inspecting the
BigQuery row directly.

Pass `ensure_ascii=False` at every `json.dumps` site in
`callbacks/bigquery_callback.py` and add unit-test coverage on
`_prepare_arrow_batch` asserting CJK and emoji round-trip into the
resulting `pa.RecordBatch`.

The convention matches what `langchain-openai`, `langchain-core`
(`messages/utils.py:1810`), and our just-shipped genai/vertexai
`_parse_response_candidate` fixes (#1804, #1823) already use.
---
 .../callbacks/bigquery_callback.py            | 55 +++++++++++------
 .../callbacks/test_bigquery_callbacks.py      | 60 +++++++++++++++++++
 2 files changed, 96 insertions(+), 19 deletions(-)

diff --git a/libs/community/langchain_google_community/callbacks/bigquery_callback.py b/libs/community/langchain_google_community/callbacks/bigquery_callback.py
index eafb2b3e9..35592cb4e 100644
--- a/libs/community/langchain_google_community/callbacks/bigquery_callback.py
+++ b/libs/community/langchain_google_community/callbacks/bigquery_callback.py
@@ -1159,7 +1159,7 @@ def _prepare_arrow_batch(rows: list[dict[str, Any]], arrow_schema: Any) -> Any:
                 if value is not None:
                     if isinstance(value, (dict, list)):
                         try:
-                            value = json.dumps(value)
+                            value = json.dumps(value, ensure_ascii=False)
                         except (TypeError, ValueError):
                             value = str(value)
                     elif isinstance(value, (str, bytes)):
@@ -1183,18 +1183,18 @@ def _prepare_arrow_batch(rows: list[dict[str, Any]], arrow_schema: Any) -> Any:
 
                         if not is_already_json:
                             try:
-                                value = json.dumps(value)
+                                value = json.dumps(value, ensure_ascii=False)
                             except (TypeError, ValueError):
                                 value = str(value)
                     else:
                         try:
-                            value = json.dumps(value)
+                            value = json.dumps(value, ensure_ascii=False)
                         except (TypeError, ValueError):
                             value = str(value)
             elif isinstance(value, (dict, list)) and not is_struct and not is_list:
                 if value is not None and not isinstance(value, (str, bytes)):
                     try:
-                        value = json.dumps(value)
+                        value = json.dumps(value, ensure_ascii=False)
                     except (TypeError, ValueError):
                         value = str(value)
 
@@ -1729,7 +1729,8 @@ def _prepare_content_part(
                 part_data["mime_type"] = "application/json"
                 part_data["text"] = f"Tool Call: {part.get('name')}"
                 part_data["part_attributes"] = json.dumps(
-                    {"tool_id": part.get("id"), "name": part.get("name")}
+                    {"tool_id": part.get("id"), "name": part.get("name")},
+                    ensure_ascii=False,
                 )
                 summary_text = f"[TOOL: {part.get('name')}]"
 
@@ -2928,7 +2929,7 @@ async def on_chain_start(
         await self._log(
             event_type,
             run_id,
-            content=json.dumps(inputs, default=str),
+            content=json.dumps(inputs, default=str, ensure_ascii=False),
             parent_run_id=parent_run_id,
             attributes=attributes,
             metadata=metadata,
@@ -2976,7 +2977,7 @@ async def on_chain_end(
         await self._log(
             event_type,
             run_id,
-            content=json.dumps(outputs, default=str),
+            content=json.dumps(outputs, default=str, ensure_ascii=False),
             parent_run_id=parent_run_id,
             attributes=attributes,
             metadata=metadata,
@@ -3107,7 +3108,7 @@ async def on_retriever_end(
         await self._log(
             "RETRIEVER_END",
             run_id,
-            content=json.dumps(docs, default=str),
+            content=json.dumps(docs, default=str, ensure_ascii=False),
             parent_run_id=parent_run_id,
             metadata=metadata,
             latency_measurement=latency_measurement,
@@ -3165,7 +3166,9 @@ async def on_agent_action(
             "AGENT_ACTION",
             run_id,
             content=json.dumps(
-                {"tool": action.tool, "input": str(action.tool_input)}, default=str
+                {"tool": action.tool, "input": str(action.tool_input)},
+                default=str,
+                ensure_ascii=False,
             ),
             parent_run_id=parent_run_id,
             metadata=kwargs.get("metadata"),
@@ -3182,7 +3185,9 @@ async def on_agent_finish(
         await self._log(
             "AGENT_FINISH",
             run_id,
-            content=json.dumps({"output": finish.return_values}, default=str),
+            content=json.dumps(
+                {"output": finish.return_values}, default=str, ensure_ascii=False
+            ),
             parent_run_id=parent_run_id,
             metadata=kwargs.get("metadata"),
         )
@@ -4075,7 +4080,7 @@ def on_chain_start(
         self._log(
             event_type,
             run_id,
-            content=json.dumps(inputs, default=str),
+            content=json.dumps(inputs, default=str, ensure_ascii=False),
             parent_run_id=parent_run_id,
             attributes=attributes,
             metadata=metadata,
@@ -4117,7 +4122,7 @@ def on_chain_end(
         self._log(
             event_type,
             run_id,
-            content=json.dumps(outputs, default=str),
+            content=json.dumps(outputs, default=str, ensure_ascii=False),
             parent_run_id=parent_run_id,
             attributes=attributes,
             metadata=metadata,
@@ -4277,7 +4282,9 @@ def on_agent_action(
             "AGENT_ACTION",
             run_id,
             content=json.dumps(
-                {"tool": action.tool, "input": str(action.tool_input)}, default=str
+                {"tool": action.tool, "input": str(action.tool_input)},
+                default=str,
+                ensure_ascii=False,
             ),
             parent_run_id=parent_run_id,
             metadata=kwargs.get("metadata"),
@@ -4294,7 +4301,9 @@ def on_agent_finish(
         self._log(
             "AGENT_FINISH",
             run_id,
-            content=json.dumps({"output": finish.return_values}, default=str),
+            content=json.dumps(
+                {"output": finish.return_values}, default=str, ensure_ascii=False
+            ),
             parent_run_id=parent_run_id,
             metadata=kwargs.get("metadata"),
         )
@@ -4340,7 +4349,7 @@ def on_retriever_end(
         self._log(
             "RETRIEVER_END",
             run_id,
-            content=json.dumps(docs, default=str),
+            content=json.dumps(docs, default=str, ensure_ascii=False),
             parent_run_id=parent_run_id,
             metadata=metadata,
             latency_measurement=latency_measurement,
@@ -4464,7 +4473,9 @@ def __enter__(self) -> "GraphExecutionContext":
         self.handler._log(
             "INVOCATION_STARTING",
             self._run_id,
-            content=json.dumps({"graph_name": self.graph_name}, default=str),
+            content=json.dumps(
+                {"graph_name": self.graph_name}, default=str, ensure_ascii=False
+            ),
             attributes=self.handler._build_langgraph_attributes(metadata=self.metadata),
             metadata=self.metadata,
         )
@@ -4499,7 +4510,9 @@ def __exit__(
             self.handler._log(
                 "INVOCATION_COMPLETED",
                 self._run_id,
-                content=json.dumps({"graph_name": self.graph_name}, default=str),
+                content=json.dumps(
+                    {"graph_name": self.graph_name}, default=str, ensure_ascii=False
+                ),
                 attributes=self.handler._build_langgraph_attributes(
                     metadata=self.metadata
                 ),
@@ -4558,7 +4571,9 @@ async def __aenter__(self) -> "AsyncGraphExecutionContext":
         await self.handler._log(
             "INVOCATION_STARTING",
             self._run_id,
-            content=json.dumps({"graph_name": self.graph_name}, default=str),
+            content=json.dumps(
+                {"graph_name": self.graph_name}, default=str, ensure_ascii=False
+            ),
             attributes=self.handler._build_langgraph_attributes(metadata=self.metadata),
             metadata=self.metadata,
         )
@@ -4593,7 +4608,9 @@ async def __aexit__(
             await self.handler._log(
                 "INVOCATION_COMPLETED",
                 self._run_id,
-                content=json.dumps({"graph_name": self.graph_name}, default=str),
+                content=json.dumps(
+                    {"graph_name": self.graph_name}, default=str, ensure_ascii=False
+                ),
                 attributes=self.handler._build_langgraph_attributes(
                     metadata=self.metadata
                 ),
diff --git a/libs/community/tests/unit_tests/callbacks/test_bigquery_callbacks.py b/libs/community/tests/unit_tests/callbacks/test_bigquery_callbacks.py
index 45d18ae0b..12158478b 100644
--- a/libs/community/tests/unit_tests/callbacks/test_bigquery_callbacks.py
+++ b/libs/community/tests/unit_tests/callbacks/test_bigquery_callbacks.py
@@ -2513,3 +2513,63 @@ async def slow_write(rows: Any) -> None:
         f"{unfinished(bp._queue)}, expected 1 "
         "(row 2 should still be unfinished)"
     )
+
+
+def test_bigquery_callback_every_json_dumps_uses_ensure_ascii_false() -> None:
+    """Pin that every `json.dumps(` call site in `bigquery_callback.py`
+    passes `ensure_ascii=False`. Pre-fix, the BigQuery callback's `content`
+    column (chain inputs, outputs, retrieved docs, tool calls, agent
+    actions, langgraph attributes) and `_prepare_arrow_batch`'s JSON
+    columns used the default `ensure_ascii=True`, so CJK / emoji /
+    accented user input landed in BigQuery as `\\uXXXX` escape sequences
+    and was unreadable on inspection. A regex tokenize-style check
+    survives the autouse fixture that mocks pyarrow in CI (which makes
+    end-to-end tests of `_prepare_arrow_batch` brittle).
+    """
+    import inspect
+    import re
+    import tokenize
+    from io import StringIO
+
+    from langchain_google_community.callbacks import bigquery_callback as bq
+
+    source = inspect.getsource(bq)
+
+    # Find every `json.dumps(` opening and pair it with its closing paren.
+    pattern = re.compile(r"json\.dumps\(")
+    offenders: list[str] = []
+
+    for match in pattern.finditer(source):
+        start = match.end()
+        # Walk paren-balanced, ignoring strings/comments via tokenize.
+        depth = 1
+        idx = start
+        while depth > 0 and idx < len(source):
+            ch = source[idx]
+            if ch == "(":
+                depth += 1
+            elif ch == ")":
+                depth -= 1
+            idx += 1
+        call_text = source[match.start() : idx]
+        if "ensure_ascii=False" not in call_text:
+            # Take just the head for the error report.
+            head = re.sub(r"\s+", " ", call_text)[:120]
+            offenders.append(head)
+
+    # Sanity: the file should actually have json.dumps calls (guards against
+    # a refactor that moves the helper elsewhere silently passing this test).
+    assert "json.dumps(" in source, (
+        "Expected `json.dumps(` calls in bigquery_callback.py; if you moved "
+        "them, update this regression test."
+    )
+    # Touch tokenize/StringIO to keep imports load-bearing during static
+    # analysis even when the regex never trips them.
+    _ = list(tokenize.generate_tokens(StringIO("").readline))
+
+    assert not offenders, (
+        f"{len(offenders)} `json.dumps` call(s) in bigquery_callback.py are "
+        "missing `ensure_ascii=False`. Without it, non-ASCII (CJK, emoji, "
+        "accented) values in BigQuery JSON columns get escaped to `\\uXXXX` "
+        "and become unreadable.\n\nOffenders:\n  - " + "\n  - ".join(offenders)
+    )