Skip to content

Commit

Permalink
Release 0.29.4, fix even more performance issues and get a 20% improve…
Browse files Browse the repository at this point in the history
…ment from the baseline. Bringing the library back to the performance of a few versions ago.
  • Loading branch information
mangiucugna committed Sep 22, 2024
1 parent 7f47e3b commit bdbea5f
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 81 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "json_repair"
version = "0.29.3"
version = "0.29.4"
license = {file = "LICENSE"}
authors = [
{ name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
Expand Down
46 changes: 11 additions & 35 deletions src/json_repair/json_context.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from enum import Enum, auto
from typing import List
from typing import List, Optional


class ContextValues(Enum):
Expand All @@ -11,6 +11,8 @@ class ContextValues(Enum):
class JsonContext:
def __init__(self) -> None:
self.context: List[ContextValues] = []
self.current: Optional[ContextValues] = None
self.empty: bool = True

def set(self, value: ContextValues) -> None:
"""
Expand All @@ -25,6 +27,8 @@ def set(self, value: ContextValues) -> None:
# If a value is provided update the context variable and save in stack
if value:
self.context.append(value)
self.current = value
self.empty = False

def reset(self) -> None:
"""
Expand All @@ -33,37 +37,9 @@ def reset(self) -> None:
Returns:
None
"""
self.context.pop()

def is_current(self, context: ContextValues) -> bool:
"""
Check if the given context is the current (most recent) context.
Args:
context (ContextValues): The context value to check.
Returns:
bool: True if the given context is the same as the most recent context in the stack, False otherwise.
"""
return self.context[-1] == context

def is_any(self, context: ContextValues) -> bool:
"""
Check if the given context exists anywhere in the context stack.
Args:
context (ContextValues): The context value to check.
Returns:
bool: True if the given context exists in the stack, False otherwise.
"""
return context in self.context

def is_empty(self) -> bool:
"""
Check if the context stack is empty.
Returns:
bool: True if the context stack is empty, False otherwise.
"""
return len(self.context) == 0
try:
self.context.pop()
self.current = self.context[-1]
except IndexError:
self.current = None
self.empty = True
66 changes: 27 additions & 39 deletions src/json_repair/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,10 @@ def parse_json(
)
return ""
# <string> starts with a quote
elif not self.context.is_empty() and (
char in ['"', "'", "“"] or char.isalpha()
):
elif not self.context.empty and (char in ['"', "'", "“"] or char.isalpha()):
return self.parse_string()
# <number> starts with [0-9] or minus
elif not self.context.is_empty() and (
elif not self.context.empty and (
char.isdigit() or char == "-" or char == "."
):
return self.parse_number()
Expand Down Expand Up @@ -235,8 +233,9 @@ def parse_string(self) -> Union[str, bool, None]:
elif char.isalnum():
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
# But remember, object keys are only of type string
if char.lower() in ["t", "f", "n"] and not self.context.is_current(
ContextValues.OBJECT_KEY
if (
char.lower() in ["t", "f", "n"]
and self.context.current != ContextValues.OBJECT_KEY
):
value = self.parse_boolean_or_null()
if value != "":
Expand All @@ -256,15 +255,13 @@ def parse_string(self) -> Union[str, bool, None]:
if self.get_char_at() == lstring_delimiter:
# If it's an empty key, this was easy
if (
self.context.is_current(ContextValues.OBJECT_KEY)
self.context.current == ContextValues.OBJECT_KEY
and self.get_char_at(1) == ":"
):
self.index += 1
return ""
# Find the next delimiter
i = self.skip_to_character(
character=rstring_delimiter, idx=1, move_main_index=False
)
i = self.skip_to_character(character=rstring_delimiter, idx=1)
next_c = self.get_char_at(i)
# Now check that the next character is also a delimiter to ensure that we have "".....""
# In that case we ignore this rstring delimiter
Expand Down Expand Up @@ -297,22 +294,20 @@ def parse_string(self) -> Union[str, bool, None]:
while char and char != rstring_delimiter:
if (
missing_quotes
and self.context.is_current(ContextValues.OBJECT_KEY)
and self.context.current == ContextValues.OBJECT_KEY
and (char == ":" or char.isspace())
):
self.log(
"While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
)
break
if self.context.is_current(ContextValues.OBJECT_VALUE) and char in [
if self.context.current == ContextValues.OBJECT_VALUE and char in [
",",
"}",
]:
rstring_delimiter_missing = True
# check if this is a case in which the closing comma is NOT missing instead
i = self.skip_to_character(
character=rstring_delimiter, idx=1, move_main_index=False
)
i = self.skip_to_character(character=rstring_delimiter, idx=1)
next_c = self.get_char_at(i)
if next_c:
i += 1
Expand Down Expand Up @@ -346,8 +341,9 @@ def parse_string(self) -> Union[str, bool, None]:
"While parsing a string, we found a doubled quote, ignoring it"
)
self.index += 1
elif missing_quotes and self.context.is_current(
ContextValues.OBJECT_VALUE
elif (
missing_quotes
and self.context.current == ContextValues.OBJECT_VALUE
):
# In case of missing starting quote I need to check if the delimiter is the end or the beginning of a key
i = 1
Expand Down Expand Up @@ -388,34 +384,33 @@ def parse_string(self) -> Union[str, bool, None]:
# If we are in an object context, let's check for the right delimiters
if (
(
self.context.is_any(ContextValues.OBJECT_KEY)
ContextValues.OBJECT_KEY in self.context.context
and next_c in [":", "}"]
)
or (
self.context.is_any(ContextValues.OBJECT_VALUE)
ContextValues.OBJECT_VALUE in self.context.context
and next_c == "}"
)
or (
self.context.is_any(ContextValues.ARRAY)
ContextValues.ARRAY in self.context.context
and next_c in ["]", ","]
)
or (
check_comma_in_object_value
and self.context.is_current(ContextValues.OBJECT_VALUE)
and self.context.current == ContextValues.OBJECT_VALUE
and next_c == ","
)
):
break
i += 1
next_c = self.get_char_at(i)
# If we stopped for a comma in object_value context, let's check if we find a "} at the end of the string
if next_c == "," and self.context.is_current(
ContextValues.OBJECT_VALUE
if (
next_c == ","
and self.context.current == ContextValues.OBJECT_VALUE
):
i += 1
i = self.skip_to_character(
character=rstring_delimiter, idx=i, move_main_index=False
)
i = self.skip_to_character(character=rstring_delimiter, idx=i)
next_c = self.get_char_at(i)
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
i += 1
Expand All @@ -430,15 +425,13 @@ def parse_string(self) -> Union[str, bool, None]:
self.index += 1
char = self.get_char_at()
elif next_c == rstring_delimiter:
if self.context.is_current(ContextValues.OBJECT_VALUE):
if self.context.current == ContextValues.OBJECT_VALUE:
# But this might not be it! This could be just a missing comma
# We found a delimiter and we need to check if this is a key
# so find a rstring_delimiter and a colon after
i += 1
i = self.skip_to_character(
character=rstring_delimiter,
idx=i,
move_main_index=False,
character=rstring_delimiter, idx=i
)
i += 1
next_c = self.get_char_at(i)
Expand All @@ -463,7 +456,7 @@ def parse_string(self) -> Union[str, bool, None]:
if (
char
and missing_quotes
and self.context.is_current(ContextValues.OBJECT_KEY)
and self.context.current == ContextValues.OBJECT_KEY
and char.isspace()
):
self.log(
Expand All @@ -489,7 +482,7 @@ def parse_number(self) -> Union[float, int, str, JSONReturnType]:
number_str = ""
number_chars = set("0123456789-.eE/,")
char = self.get_char_at()
is_array = self.context.is_current(ContextValues.ARRAY)
is_array = self.context.current == ContextValues.ARRAY
while char and char in number_chars and (char != "," or not is_array):
number_str += char
self.index += 1
Expand Down Expand Up @@ -562,9 +555,7 @@ def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
return idx
return idx

def skip_to_character(
self, character: str, idx: int = 0, move_main_index=True
) -> int:
def skip_to_character(self, character: str, idx: int = 0) -> int:
"""
This function quickly iterates to find a character, syntactic sugar to make the code more concise
"""
Expand All @@ -573,10 +564,7 @@ def skip_to_character(
except IndexError:
return idx
while char != character:
if move_main_index: # pragma: no cover
self.index += 1
else:
idx += 1
idx += 1
try:
char = self.json_str[self.index + idx]
except IndexError:
Expand Down
12 changes: 6 additions & 6 deletions tests/test_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_true_true_correct(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 2 / 10 ** 3 # 2 millisecond
max_time = 1.8 / 10 ** 3 # 1.8 millisecond

# Assert that the median time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
Expand All @@ -31,7 +31,7 @@ def test_true_true_incorrect(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 2 / 10 ** 3 # 2 millisecond
max_time = 1.8 / 10 ** 3 # 1.8 millisecond

# Assert that the median time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
Expand All @@ -53,7 +53,7 @@ def test_true_false_incorrect(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 2 / 10 ** 3 # 2 millisecond
max_time = 1.8 / 10 ** 3 # 1.8 millisecond

# Assert that the median time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
Expand All @@ -64,7 +64,7 @@ def test_false_true_correct(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 2 / 10 ** 3 # 2 millisecond
max_time = 1.8 / 10 ** 3 # 1.8 millisecond

# Assert that the median time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
Expand All @@ -75,7 +75,7 @@ def test_false_true_incorrect(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 2 / 10 ** 3 # 2 millisecond
max_time = 1.8 / 10 ** 3 # 1.8 millisecond

# Assert that the median time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
Expand All @@ -97,7 +97,7 @@ def test_false_false_incorrect(benchmark):
mean_time = benchmark.stats.get("median")

# Define your time threshold in seconds
max_time = 2 / 10 ** 3 # 2 millisecond
max_time = 1.8 / 10 ** 3 # 1.8 millisecond

# Assert that the median time is below the threshold
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"

0 comments on commit bdbea5f

Please sign in to comment.