Skip to content

Commit

Permalink
Fix #91, when a key is missing a right delimiter (but not the left on…
Browse files Browse the repository at this point in the history
…e) there could be cases in which this can be repaired safely.
  • Loading branch information
mangiucugna committed Dec 31, 2024
1 parent 465e568 commit 525f9eb
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "json_repair"
version = "0.34.0"
version = "0.35.0"
license = {file = "LICENSE"}
authors = [
{ name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
Expand Down
32 changes: 32 additions & 0 deletions src/json_repair/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,38 @@ def parse_string(self) -> Union[str, bool, None]:
string_acc += escape_seqs.get(char, char) or char
self.index += 1
char = self.get_char_at()
# If we are in object key context and we find a colon, it could be a missing right quote
if (
char == ":"
and not missing_quotes
and self.context.current == ContextValues.OBJECT_KEY
):
# Ok now we need to check if this is followed by a value like "..."
i = self.skip_to_character(character=lstring_delimiter, idx=1)
next_c = self.get_char_at(i)
if next_c:
i += 1
# found the first delimiter
i = self.skip_to_character(character=rstring_delimiter, idx=i)
next_c = self.get_char_at(i)
if next_c:
# found a second delimiter
i += 1
# Skip spaces
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
next_c = self.get_char_at(i)
if next_c and next_c in [",", "}"]:
# Ok then this is a missing right quote
self.log(
"While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
)
break
else:
# The string ended without finding a lstring_delimiter, I will assume this is a missing right quote
self.log(
"While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
)
break
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
if char == rstring_delimiter:
# Special case here, in case of double quotes one after another
Expand Down
2 changes: 2 additions & 0 deletions tests/test_json_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ def test_object_edge_cases():
assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}'
assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}'
assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}'
assert repair_json('{"key:"value"}') == '{"key": "value"}'
assert repair_json('{"key:value}') == '{"key": "value"}'
assert repair_json('[{"lorem": {"ipsum": "sic"}, """" "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, {"lorem": {"ipsum": "sic"}}]'

def test_number_edge_cases():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def test_false_false_incorrect(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 1.9 / 10 ** 3 # 1.9 millisecond
max_time = 2 / 10 ** 3 # 2 millisecond

# Assert that the average time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"

0 comments on commit 525f9eb

Please sign in to comment.