Skip to content

Commit

Permalink
fix(dehtml): do not insert unnecessary newlines when parsing <p> tags
Browse files Browse the repository at this point in the history
Previously, parsing of `<p>Foo</p><p>Bar</p>`
resulted in `\n\nFoo\n\n\n\nBar\n\n`.

Now it results in `Foo\n\nBar`.
  • Loading branch information
link2xt committed Jun 16, 2023
1 parent 92e34d6 commit 00cb72f
Showing 1 changed file with 19 additions and 2 deletions.
21 changes: 19 additions & 2 deletions src/dehtml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
.to_lowercase();

match tag.as_str() {
"p" | "table" | "td" | "style" | "script" | "title" | "pre" => {
"style" | "script" | "title" | "pre" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
Expand Down Expand Up @@ -200,7 +200,9 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(

match tag.as_str() {
"p" | "table" | "td" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
if !dehtml.strbuilder.is_empty() {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
}
dehtml.add_text = AddText::YesRemoveLineEnds;
}
#[rustfmt::skip]
Expand Down Expand Up @@ -353,6 +355,21 @@ mod tests {
assert_eq!(plain, "line1\n\r\r\rline2\nline3");
}

// Paragraph handling: consecutive `<p>` elements, whether or not they are
// explicitly closed, must come out separated by exactly one blank line, with
// no newlines before the first paragraph or after the last one.
#[test]
fn test_dehtml_parse_p() {
    // (input HTML, expected plain text)
    let cases = [
        // Properly closed paragraphs.
        ("<p>Foo</p><p>Bar</p>", "Foo\n\nBar"),
        // Paragraphs that are never closed.
        ("<p>Foo<p>Bar", "Foo\n\nBar"),
        // A mix of closed and unclosed paragraphs.
        ("<p>Foo</p><p>Bar<p>Baz", "Foo\n\nBar\n\nBaz"),
    ];

    for &(html, expected) in cases.iter() {
        let plain = dehtml(html).unwrap();
        assert_eq!(plain, expected);
    }
}

#[test]
fn test_dehtml_parse_href() {
let html = "<a href=url>text</a";
Expand Down

0 comments on commit 00cb72f

Please sign in to comment.