Skip to content

Commit

Permalink
Merge pull request #31 from edx/rijuma/academic-16262
Browse files Browse the repository at this point in the history
[ACADEMIC-16262] Updated HTML parser to remove unwanted tag contents.
  • Loading branch information
rijuma authored Jul 5, 2023
2 parents fe60550 + a707118 commit 64bd7bc
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 4 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ Change Log
Unreleased
**********

2.0.2 – 2023-07-05
**********************************************

Fix
=====

* Updated the HTML parser to remove certain tags together with their content, such as `<script>` or `<style>`.


2.0.1 – 2023-06-29
**********************************************

Expand Down
2 changes: 1 addition & 1 deletion ai_aside/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin containing xblocks and apps supporting GPT and other LLM use on edX.
"""

__version__ = '2.0.1'
__version__ = '2.0.2'
1 change: 1 addition & 0 deletions ai_aside/summaryhook_aside/settings/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ def plugin_settings(settings):
settings.SUMMARY_HOOK_HOST = env_tokens.get('SUMMARY_HOOK_HOST', '')
settings.SUMMARY_HOOK_JS_PATH = env_tokens.get('SUMMARY_HOOK_JS_PATH', '')
settings.AISPOT_LMS_NAME = env_tokens.get('AISPOT_LMS_NAME', '')
settings.HTML_TAGS_TO_REMOVE = ['script', 'style']
14 changes: 12 additions & 2 deletions ai_aside/summaryhook_aside/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from html.parser import HTMLParser
from re import sub

from django.conf import settings


def cleanup_text(text):
"""
Expand All @@ -22,19 +24,27 @@ class _HTMLToTextHelper(HTMLParser): # lint-amnesty, pylint: disable=abstract-m
"""
Helper class for html_to_text below
"""
_is_content = True

def __init__(self):
HTMLParser.__init__(self)
self.reset()
self.fed = []

def handle_starttag(self, tag, _):
"""This runs when a new tag is found. We use this to exclude unwanted content."""
tags_to_filter = getattr(settings, 'HTML_TAGS_TO_REMOVE', None)
self._is_content = not (tags_to_filter and tag in tags_to_filter)

def handle_data(self, data):
"""takes the data in separate chunks"""
self.fed.append(data)
if self._is_content:
self.fed.append(data)

def handle_entityref(self, name):
"""appends the reference to the body"""
self.fed.append('&%s;' % name)
if self._is_content:
self.fed.append('&%s;' % name)

def get_data(self):
"""joins together the seperate chunks into one cohesive string"""
Expand Down
Empty file removed tests/__init__.py
Empty file.
47 changes: 46 additions & 1 deletion tests/summaryhook_aside/test_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def get_children(self):
return self.children


@override_settings(SUMMARY_HOOK_MIN_SIZE=40)
@override_settings(SUMMARY_HOOK_MIN_SIZE=40, HTML_TAGS_TO_REMOVE=['script', 'style', 'test'])
class TestSummaryHookAside(TestCase):
def setUp(self):
module_mock = MagicMock()
Expand Down Expand Up @@ -200,6 +200,51 @@ def test_parse_children_contents_with_valid_children_2(self):
}]
)

length, items = _parse_children_contents(block)
self.assertEqual(length, expected_length)
self.assertEqual(items, expected_items)

def test_parse_children_contents_with_script_or_style_tags(self):
children = [
FakeChild('html', '01', '''
<span>This should be the only text to be extracted.</span>
<test>For testing purposes only, this tag is ignored as well</test>
<script>
function ignore() {
console.log('This content should be ignored.');
}
</script>
<script type="text/javascript">
console.log('This should be ignored as well.')
</script>
<script src="https://nevermind.me/i-should-also-be-discarded.js" type="text/javascript"></script>'''),
FakeChild('html', '02', '''
<div class="cypher">Why oh why didn't I take the <em>BLUE</em> pill?</div>
<style>
.cypher em {
color: #00f;
}
</style>'''),
]
block = FakeBlock(children)

expected_length, expected_items = (
84,
[{
'definition_id': 'def-id-01',
'content_type': 'TEXT',
'content_text': 'This should be the only text to be extracted.',
'published_on': 'published-on-01',
'edited_on': 'edited-on-01',
}, {
'definition_id': 'def-id-02',
'content_type': 'TEXT',
'content_text': 'Why oh why didn\'t I take the BLUE pill?',
'published_on': 'published-on-02',
'edited_on': 'edited-on-02',
}]
)

length, items = _parse_children_contents(block)

self.assertEqual(length, expected_length)
Expand Down

0 comments on commit 64bd7bc

Please sign in to comment.