diff --git a/pyproject.toml b/pyproject.toml index 0202425..9182e21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] name = "json_repair" -version = "0.33.0" +version = "0.34.0" license = {file = "LICENSE"} authors = [ { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" }, diff --git a/src/json_repair/json_parser.py b/src/json_repair/json_parser.py index 1b253b0..34d1eaf 100644 --- a/src/json_repair/json_parser.py +++ b/src/json_repair/json_parser.py @@ -7,6 +7,9 @@ class JSONParser: + # Constants + STRING_DELIMITERS = ['"', "'", "“", "”"] + def __init__( self, json_str: Union[str, StringFileWrapper], @@ -89,7 +92,9 @@ def parse_json( ) return "" # starts with a quote - elif not self.context.empty and (char in ['"', "'", "“"] or char.isalpha()): + elif not self.context.empty and ( + char in self.STRING_DELIMITERS or char.isalpha() + ): return self.parse_string() # starts with [0-9] or minus elif not self.context.empty and ( @@ -130,6 +135,8 @@ def parse_object(self) -> Dict[str, JSONReturnType]: # starts with a key = "" while self.get_char_at(): + # The rollback index needs to be updated here in case the key is empty + rollback_index = self.index key = str(self.parse_string()) if key != "" or (key == "" and self.get_char_at() == ":"): @@ -140,6 +147,12 @@ def parse_object(self) -> Dict[str, JSONReturnType]: "While parsing an object we found a duplicate key, closing the object here and rolling back the index", ) self.index = rollback_index - 1 + # add an opening curly brace to make this work + self.json_str = ( + self.json_str[: self.index + 1] + + "{" + + self.json_str[self.index + 1 :] + ) break # Skip filler whitespaces @@ -227,7 +240,7 @@ def parse_string(self) -> Union[str, bool, None]: char = self.get_char_at() # A valid string can only start with a valid quote or, in our case, with a literal - while char and char not in ['"', "'", "“"] and not char.isalnum(): + while char and char not in self.STRING_DELIMITERS and not char.isalnum(): self.index += 1 char = self.get_char_at() @@ -262,35 +275,61 @@ def parse_string(self) -> Union[str, bool, None]: if not missing_quotes: self.index += 1 + self.skip_whitespaces_at() # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop - if self.get_char_at() == lstring_delimiter: - # If it's an empty key, this was easy - if ( - self.context.current == ContextValues.OBJECT_KEY - and self.get_char_at(1) == ":" - ): - self.index += 1 - return "" - # Find the next delimiter - i = self.skip_to_character(character=rstring_delimiter, idx=1) - next_c = self.get_char_at(i) - # Now check that the next character is also a delimiter to ensure that we have ""....."" - # In that case we ignore this rstring delimiter - if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter: - self.log( - "While parsing a string, we found a valid starting doubled quote, ignoring it", - ) - doubled_quotes = True - self.index += 1 - else: - # Ok this is not a doubled quote, check if this is an empty string or not - i = self.skip_whitespaces_at(idx=1, move_main_index=False) + if self.get_char_at() in self.STRING_DELIMITERS: + # If the next character is the same type of quote, then we manage it as double quotes + if self.get_char_at() == lstring_delimiter: + # If it's an empty key, this was easy + if ( + self.context.current == ContextValues.OBJECT_KEY + and self.get_char_at(1) == ":" + ): + self.index += 1 + return "" + if self.get_char_at(1) == lstring_delimiter: + # There's something fishy about this, we found doubled quotes and then again quotes + self.log( + "While parsing a string, we found a doubled quote and then a quote again, ignoring it", + ) + return "" + # Find the next delimiter + i = self.skip_to_character(character=rstring_delimiter, idx=1) next_c = self.get_char_at(i) - if next_c not in [",", "]", "}"]: + # Now check that the next character is also a delimiter to ensure that we have ""....."" + # In that case we ignore this rstring delimiter + if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter: self.log( - "While parsing a string, we found a doubled quote but it was a mistake, removing one quote", + "While parsing a string, we found a valid starting doubled quote", ) + doubled_quotes = True self.index += 1 + else: + # Ok this is not a doubled quote, check if this is an empty string or not + i = self.skip_whitespaces_at(idx=1, move_main_index=False) + next_c = self.get_char_at(i) + if next_c in self.STRING_DELIMITERS + ["{", "["]: + # something fishy is going on here + self.log( + "While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it", + ) + self.index += 1 + return "" + elif next_c not in [",", "]", "}"]: + self.log( + "While parsing a string, we found a doubled quote but it was a mistake, removing one quote", + ) + self.index += 1 + else: + # Otherwise we need to do another check before continuing + i = self.skip_to_character(character=rstring_delimiter, idx=1) + next_c = self.get_char_at(i) + if not next_c: + # mmmm that delimiter never appears again, this is a mistake + self.log( + "While parsing a string, we found a quote but it was a mistake, ignoring it", + ) + return "" # Initialize our return value string_acc = "" diff --git a/src/json_repair/string_file_wrapper.py b/src/json_repair/string_file_wrapper.py index 00a5230..4a7bddb 100644 --- a/src/json_repair/string_file_wrapper.py +++ b/src/json_repair/string_file_wrapper.py @@ -96,3 +96,24 @@ def __len__(self) -> int: self.length = self.fd.tell() self.fd.seek(current_position) return self.length + + def __setitem__(self, index: Union[int, slice], value: str) -> None: + """ + Set a character or a slice of characters in the file. + + Args: + index (slice): The slice of characters to set. + value (str): The value to set at the specified index or slice. + """ + if isinstance(index, slice): + start = index.start or 0 + else: + start = index or 0 + + if start < 0: + start += len(self) + + current_position = self.fd.tell() + self.fd.seek(start) + self.fd.write(value) + self.fd.seek(current_position) diff --git a/tests/test_json_repair.py b/tests/test_json_repair.py index f8f976a..882eeae 100644 --- a/tests/test_json_repair.py +++ b/tests/test_json_repair.py @@ -146,14 +146,14 @@ def test_object_edge_cases(): assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}' assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}' assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}' - assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", " key2": "value2"}' + assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}' assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}' assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}' assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}' assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}' assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}' assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}' - assert repair_json('[{"lorem": {"ipsum": "sic"}, "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, "lorem", {"ipsum": "sic"}]' + assert repair_json('[{"lorem": {"ipsum": "sic"}, """" "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, {"lorem": {"ipsum": "sic"}}]' def test_number_edge_cases(): assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'