Skip to content

Commit d196918

Browse files
nqnkxtran
andauthored
[ENG-857] Filter large image messages (#190)
* Filter out large images from logs * Filter out single images larger than 1MB * Formatting * Bump image size limit to 4MB * Update magentic test to assert completion id in the platform --------- Co-authored-by: Kim Tran <ksprtran@gmail.com>
1 parent feb8001 commit d196918

File tree

3 files changed

+108
-1
lines changed

3 files changed

+108
-1
lines changed

log10/load.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,36 @@ def _process_chunk(self, chunk):
473473
logger.error(f"Failed to insert in log10: {self.partial_log_row} with error {res.text}. Skipping")
474474

475475

476+
# Filter large inline images from messages, replacing each with a text fragment saying "Image too large to capture"
477+
def _is_oversized_inline_image(fragment, max_length):
    """Return True if *fragment* is an inline ``data:image`` URL longer than *max_length*."""
    if not isinstance(fragment, dict) or fragment.get("type") != "image_url":
        return False
    image_url = fragment.get("image_url")
    # ``image_url`` is normally ``{"url": ...}``; tolerate a bare string or a
    # missing/None value instead of raising AttributeError while logging.
    url = image_url.get("url") if isinstance(image_url, dict) else image_url
    if not isinstance(url, str):
        return False
    # Remote (http/https) URLs are short references and are always kept.
    return url.startswith("data:image") and len(url) > max_length


def filter_large_images(messages, *, max_image_url_length=4_000_000):
    """Replace oversized inline images in chat messages with a text placeholder.

    Only messages whose ``content`` is a list of fragments are inspected; any
    other content (e.g. a plain string) is left alone. Within a fragment
    list, an ``image_url`` fragment is replaced by
    ``{"type": "text", "text": "Image too large to capture"}`` when its URL is
    a ``data:image`` URL longer than ``max_image_url_length`` characters.
    Every other fragment is kept unchanged.

    Args:
        messages: Iterable of message dicts. Messages with list content are
            updated in place: their ``content`` key is rebound to a new list.
        max_image_url_length: Maximum allowed length, in characters, of a
            ``data:image`` URL. This measures the encoded URL string, not the
            decoded image bytes.

    Returns:
        The same ``messages`` object that was passed in.
    """
    for message in messages:
        content = message.get("content")
        # Content may be a list of text/image fragments; anything else is a
        # single fragment and cannot contain an inline image to filter.
        if not isinstance(content, list):
            continue
        message["content"] = [
            {"type": "text", "text": "Image too large to capture"}
            if _is_oversized_inline_image(fragment, max_image_url_length)
            else fragment
            for fragment in content
        ]

    return messages
504+
505+
476506
def flatten_messages(messages):
477507
flat_messages = []
478508
for message in messages:
@@ -524,7 +554,7 @@ def _init_log_row(func, *args, **kwargs):
524554
# We may have to flatten messages from their ChatCompletionMessage with nested ChatCompletionMessageToolCall to json serializable format
525555
# Rewrite in-place
526556
if "messages" in kwargs_copy:
527-
kwargs_copy["messages"] = flatten_messages(kwargs_copy["messages"])
557+
kwargs_copy["messages"] = filter_large_images(flatten_messages(kwargs_copy["messages"]))
528558

529559
# kind and request are set based on the module and qualname
530560
# request is based on openai schema

tests/test_large_images.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import copy
2+
3+
from log10.load import filter_large_images
4+
5+
6+
def test_empty_messages():
    """An empty message list comes back as an empty list."""
    filtered = filter_large_images([])
    assert filtered == []
8+
9+
10+
# Test for regular messages without fragments i.e. content isn't a list.
11+
def test_non_fragment_messages():
    """Messages whose content is a plain string compare equal after filtering."""
    expected = [
        {"content": "This is a message.", "role": "system"},
        {"content": "This is another message.", "role": "user"},
    ]
    # Filter a deep copy so `expected` stays an untouched reference.
    filtered = filter_large_images(copy.deepcopy(expected))
    assert filtered == expected
17+
18+
19+
# Test for a message with a fragment that is not an image.
20+
def test_non_image_fragment():
    """Fragment lists that contain only text fragments compare equal after filtering."""
    system_message = {
        "content": [{"type": "text", "text": "This is a message."}],
        "role": "system",
    }
    user_message = {
        "content": [{"type": "text", "text": "This is another message."}],
        "role": "user",
    }
    expected = [system_message, user_message]
    # Filter a deep copy so `expected` stays an untouched reference.
    filtered = filter_large_images(copy.deepcopy(expected))
    assert filtered == expected
29+
30+
31+
def test_small_image_fragment():
    """An image fragment that points at an https URL is kept as-is."""
    image_message = {
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/image.png"},
            }
        ],
        "role": "system",
    }
    expected = [
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
        image_message,
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
    ]
    # Filter a deep copy so `expected` stays an untouched reference.
    filtered = filter_large_images(copy.deepcopy(expected))
    assert filtered == expected
46+
47+
48+
def test_large_image_fragment():
    """A data-URL image whose URL exceeds 4e6 characters becomes the text placeholder."""
    # 4e6 payload characters plus the data-URL prefix puts the URL over the limit.
    payload = "a" * int(4e6)
    oversized_image = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{payload}"},
    }
    placeholder = {
        "type": "text",
        "text": "Image too large to capture",
    }

    before_messages = [
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
        {"content": [oversized_image], "role": "system"},
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
    ]
    after_messages = [
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
        {"content": [placeholder], "role": "system"},
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
    ]

    filtered = filter_large_images(before_messages)
    assert filtered == after_messages

tests/test_magentic.py

+1
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,4 @@ def _llm() -> str: ...
208208

209209
output = _llm()
210210
assert isinstance(output, str)
211+
_LogAssertion(completion_id=session.last_completion_id(), message_content=output).assert_chat_response()

0 commit comments

Comments
 (0)