Skip to content

Commit

Permalink
Merge pull request #168 from wrabit/regex_compiler
Browse files Browse the repository at this point in the history
Regex compiler
  • Loading branch information
wrabit authored Sep 29, 2024
2 parents 656d443 + 9be11c3 commit ca4ddde
Show file tree
Hide file tree
Showing 14 changed files with 588 additions and 356 deletions.
2 changes: 1 addition & 1 deletion dev/example_project/render_load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def main():
configure_django()

runs = 5
iterations = 500
iterations = 5000

print(f"Running benchmarks with {runs} runs, {iterations} iterations each")

Expand Down
14 changes: 13 additions & 1 deletion django_cotton/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
App configuration to set up the cotton loader and builtins automatically.
"""

import re
from contextlib import suppress

import django.contrib.admin
Expand Down Expand Up @@ -65,6 +65,11 @@ class LoaderAppConfig(AppConfig):
default = True

def ready(self):
from django.template import base

# Support for multiline tags
base.tag_re = re.compile(base.tag_re.pattern, re.DOTALL)

wrap_loaders("django")


Expand All @@ -75,3 +80,10 @@ class SimpleAppConfig(AppConfig):
"""

name = "django_cotton"

def ready(self):
pass
from django.template import base

# Support for multiline tags
base.tag_re = re.compile(base.tag_re.pattern, re.DOTALL)
326 changes: 326 additions & 0 deletions django_cotton/compiler_bs4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,326 @@
import random
import re
import warnings
from html.parser import HTMLParser

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from bs4.builder._htmlparser import BeautifulSoupHTMLParser, HTMLParserTreeBuilder
from bs4.formatter import HTMLFormatter

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)


class UnsortedAttributes(HTMLFormatter):
"""This keeps BS4 from re-ordering attributes"""

def attributes(self, tag):
for k, v in tag.attrs.items():
yield k, v


class CottonHTMLParser(BeautifulSoupHTMLParser):
"""Extending the default HTML parser to override handle_starttag so we can preserve the intended value of the
attribute from the developer so that we can differentiate boolean attributes and simply empty ones.
"""

def __init__(self, tree_builder, soup, on_duplicate_attribute):
# Initialize the parent class (HTMLParser) without additional arguments
HTMLParser.__init__(self)
self._first_processing_instruction = None
self.tree_builder = tree_builder
self.soup = soup
self._root_tag = None # Initialize _root_tag
self.already_closed_empty_element = [] # Initialize this list
self.on_duplicate_attribute = (
on_duplicate_attribute # You can set this according to your needs
)
self.IGNORE = "ignore"
self.REPLACE = "replace"

def handle_starttag(self, name, attrs, handle_empty_element=True):
"""Handle an opening tag, e.g. '<tag>'"""
attr_dict = {}
for key, value in attrs:
# Cotton edit: We want to permit valueless / "boolean" attributes
# if value is None:
# value = ''

if key in attr_dict:
on_dupe = self.on_duplicate_attribute
if on_dupe == self.IGNORE:
pass
elif on_dupe in (None, self.REPLACE):
attr_dict[key] = value
else:
on_dupe(attr_dict, key, value)
else:
attr_dict[key] = value
sourceline, sourcepos = self.getpos()
tag = self.soup.handle_starttag(
name, None, None, attr_dict, sourceline=sourceline, sourcepos=sourcepos
)
if tag and tag.is_empty_element and handle_empty_element:
self.handle_endtag(name, check_already_closed=False)
self.already_closed_empty_element.append(name)

# Cotton edit: We do not need to validate the root element
# if self._root_tag is None:
# self._root_tag_encountered(name)


class CottonHTMLTreeBuilder(HTMLParserTreeBuilder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.handle_duplicate_attributes = kwargs.get("on_duplicate_attribute", None)
self.parser_class = CottonHTMLParser

def feed(self, markup):
parser = self.parser_class(self, self.soup, self.handle_duplicate_attributes)
parser.feed(markup)
parser.close()


class CottonBs4Compiler:
DJANGO_SYNTAX_PLACEHOLDER_PREFIX = "__django_syntax__"
COTTON_VERBATIM_PATTERN = re.compile(
r"\{% cotton_verbatim %\}(.*?)\{% endcotton_verbatim %\}", re.DOTALL
)
DJANGO_TAG_PATTERN = re.compile(r"(\s?)(\{%.*?%\})(\s?)")
DJANGO_VAR_PATTERN = re.compile(r"(\s?)(\{\{.*?\}\})(\s?)")
HTML_ENTITY_PATTERN = re.compile(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[a-fA-F0-9]+;")

def __init__(self):
self.django_syntax_placeholders = []
self.html_entity_placeholders = []

def process(self, content: str):
processors = [
self._replace_syntax_with_placeholders,
self._replace_html_entities_with_placeholders,
self._compile_cotton_to_django,
self._replace_placeholders_with_syntax,
self._replace_placeholders_with_html_entities,
self._remove_duplicate_attribute_markers,
]

for processor in processors:
# noinspection PyArgumentList
content = processor(content)

return content

def _replace_html_entities_with_placeholders(self, content):
"""Replace HTML entities with placeholders so they dont get touched by BS4"""

def replace_entity(match):
entity = match.group(0)
self.html_entity_placeholders.append(entity)
return f"__HTML_ENTITY_{len(self.html_entity_placeholders) - 1}__"

return self.HTML_ENTITY_PATTERN.sub(replace_entity, content)

def _replace_placeholders_with_html_entities(self, content: str):
for i, entity in enumerate(self.html_entity_placeholders):
content = content.replace(f"__HTML_ENTITY_{i}__", entity)
return content

def _replace_syntax_with_placeholders(self, content: str):
"""Replace {% ... %} and {{ ... }} with placeholders so they dont get touched
or encoded by bs4. We will replace them back after bs4 has done its job."""
self.django_syntax_placeholders = []

def replace_pattern(pattern, replacement_func):
return pattern.sub(replacement_func, content)

def replace_cotton_verbatim(match):
"""{% cotton_verbatim %} protects the content through the bs4 parsing process when we want to actually print
cotton syntax in <pre> blocks."""
inner_content = match.group(1)
self.django_syntax_placeholders.append({"type": "verbatim", "content": inner_content})
return (
f"{self.DJANGO_SYNTAX_PLACEHOLDER_PREFIX}{len(self.django_syntax_placeholders)}__"
)

def replace_django_syntax(match):
"""Store if the match had at least one space on the left or right side of the syntax so we can restore it later"""
left_space, syntax, right_space = match.groups()
self.django_syntax_placeholders.append(
{
"type": "django",
"content": syntax,
"left_space": bool(left_space),
"right_space": bool(right_space),
}
)
return (
f" {self.DJANGO_SYNTAX_PLACEHOLDER_PREFIX}{len(self.django_syntax_placeholders)}__ "
)

# Replace cotton_verbatim blocks
content = replace_pattern(self.COTTON_VERBATIM_PATTERN, replace_cotton_verbatim)

# Replace {% ... %}
content = replace_pattern(self.DJANGO_TAG_PATTERN, replace_django_syntax)

# Replace {{ ... }}
content = replace_pattern(self.DJANGO_VAR_PATTERN, replace_django_syntax)

return content

def _compile_cotton_to_django(self, content: str):
"""Convert cotton <c-* syntax to {%."""
soup = self._make_soup(content)

if cvars_el := soup.find("c-vars"):
soup = self._wrap_with_cotton_vars_frame(soup, cvars_el)

self._transform_components(soup)

return str(soup.encode(formatter=UnsortedAttributes()).decode("utf-8"))

def _replace_placeholders_with_syntax(self, content: str):
"""Replace placeholders with original syntax."""
for i, placeholder in enumerate(self.django_syntax_placeholders, 1):
if placeholder["type"] == "verbatim":
placeholder_pattern = f"{self.DJANGO_SYNTAX_PLACEHOLDER_PREFIX}{i}__"
content = content.replace(placeholder_pattern, placeholder["content"])
else:
"""
Construct the regex pattern based on original whitespace. This is to avoid unnecessary whitespace
changes in the output that can lead to unintended tag type mutations,
i.e. <div{% expr %}></div> --> <div__placeholder></div__placeholder> --> <div{% expr %}></div{% expr %}>
"""
left_group = r"( ?)" if not placeholder["left_space"] else ""
right_group = r"( ?)" if not placeholder["right_space"] else ""
placeholder_pattern = (
f"{left_group}{self.DJANGO_SYNTAX_PLACEHOLDER_PREFIX}{i}__{right_group}"
)

content = re.sub(placeholder_pattern, placeholder["content"], content)

return content

def _remove_duplicate_attribute_markers(self, content: str):
return re.sub(r"__COTTON_DUPE_ATTR__[0-9A-F]{5}", "", content, flags=re.IGNORECASE)

def _wrap_with_cotton_vars_frame(self, soup, cvars_el):
"""If the user has defined a <c-vars> tag, wrap content with {% cotton_vars_frame %} to be able to create and
govern vars and attributes. To be able to defined new vars within a component and also have them available in the
same component's context, we wrap the entire contents in another component: cotton_vars_frame. Only when <c-vars>
is present."""

cvars_attrs = []
for k, v in cvars_el.attrs.items():
if v is None:
cvars_attrs.append(k)
else:
if k == "class":
v = " ".join(v)
cvars_attrs.append(f'{k}="{v}"')

cvars_el.decompose()
opening = f"{{% vars {' '.join(cvars_attrs)} %}}"
opening = opening.replace("\n", "")
closing = "{% endvars %}"

# Convert the remaining soup back to a string and wrap it within {% with %} block
wrapped_content = (
opening
+ str(soup.encode(formatter=UnsortedAttributes()).decode("utf-8")).strip()
+ closing
)
new_soup = self._make_soup(wrapped_content)
return new_soup

def _transform_components(self, soup):
"""Replace <c-[component path]> tags with the {% cotton_component %} template tag"""
for tag in soup.find_all(re.compile("^c-"), recursive=True):
if tag.name == "c-slot":
self._transform_named_slot(tag)

continue

component_key = tag.name[2:]
opening_tag = f"{{% c {component_key} "

# Store attributes that contain template expressions, they are when we use '{{' or '{%' in the value of an attribute
complex_attrs = []

# Build the attributes
for key, value in tag.attrs.items():
# value might be None
if value is None:
opening_tag += f" {key}"
continue

# BS4 stores class values as a list, so we need to join them back into a string
if key == "class":
value = " ".join(value)

# Django templates tags cannot have {{ or {% expressions in their attribute values
# Neither can they have new lines, let's treat them both as "expression attrs"
if self.DJANGO_SYNTAX_PLACEHOLDER_PREFIX in value or "\n" in value or "=" in value:
complex_attrs.append((key, value))
continue

opening_tag += ' {}="{}"'.format(key, value)
opening_tag += " %}"

component_tag = opening_tag

if complex_attrs:
for key, value in complex_attrs:
component_tag += f"{{% attr {key} %}}{value}{{% endattr %}}"

if tag.contents:
tag_soup = self._make_soup(tag.decode_contents(formatter=UnsortedAttributes()))
self._transform_components(tag_soup)
component_tag += str(
tag_soup.encode(formatter=UnsortedAttributes()).decode("utf-8")
# tag_soup.decode_contents(formatter=UnsortedAttributes())
)

component_tag += "{% endc %}"

# Replace the original tag with the compiled django syntax
new_soup = self._make_soup(component_tag)
tag.replace_with(new_soup)

return soup

def _transform_named_slot(self, slot_tag):
"""Compile <c-slot> to {% slot %}"""
slot_name = slot_tag.get("name", "").strip()
inner_html = slot_tag.decode_contents(formatter=UnsortedAttributes())

# Check and process any components in the slot content
slot_soup = self._make_soup(inner_html)
self._transform_components(slot_soup)

cotton_slot_tag = f"{{% slot {slot_name} %}}{str(slot_soup.encode(formatter=UnsortedAttributes()).decode('utf-8'))}{{% endslot %}}"

slot_tag.replace_with(self._make_soup(cotton_slot_tag))

def _make_soup(self, content):
return BeautifulSoup(
content,
"html.parser",
builder=CottonHTMLTreeBuilder(on_duplicate_attribute=handle_duplicate_attributes),
)


def handle_duplicate_attributes(tag_attrs, key, value):
"""BS4 cleans html and removes duplicate attributes. This would be fine if our target was html, but actually
we're targeting Django Template Language. This contains expressions to govern content including attributes of
any XML-like tag. It's perfectly fine to expect duplicate attributes per tag in DTL:
<a href="#" {% if something %} class="this" {% else %} class="that" {% endif %}>Hello</a>
The solution here is to make duplicate attribute keys unique across that tag so BS4 will not attempt to merge or
replace existing. Then in post processing we'll remove the unique mask.
Todo - This could be simplified with a custom formatter
"""
key_id = "".join(random.choice("0123456789ABCDEF") for i in range(5))
key = f"{key}__COTTON_DUPE_ATTR__{key_id}"
tag_attrs[key] = value
Loading

0 comments on commit ca4ddde

Please sign in to comment.