Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add conversion for HTML to markdown #932

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 142 additions & 3 deletions apprise/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content):
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
# For now; use same converter for Markdown support
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
}

convert = converters.get((from_format, to_format))
Expand Down Expand Up @@ -86,12 +85,23 @@ def html_to_text(content):
return parser.converted


def html_to_markdown(content):
    """
    Render HTML content as markdown.

    The content is fed through an HTMLMarkDownConverter; once the parser
    has been closed, the text it produced is returned.
    """
    converter = HTMLMarkDownConverter()

    # Parse everything we were given, then flush the parser
    converter.feed(content)
    converter.close()

    markdown = converter.converted
    return markdown


class HTMLConverter(HTMLParser, object):
"""An HTML to plain text converter tuned for email messages."""

# The following tags must start on a new line
BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)

# the following tags ignore any internal text
IGNORE_TAGS = (
Expand Down Expand Up @@ -200,3 +210,132 @@ def handle_endtag(self, tag):

if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)


class HTMLMarkDownConverter(HTMLConverter):
    """An HTML to markdown converter tuned for email messages.

    Extends HTMLConverter by emitting markdown markers for list items,
    line breaks, horizontal rules, block quotes, headings, emphasis,
    code and hyperlinks while the HTML is parsed.
    """

    # Escape markdown characters
    MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)

    # Detect line breaks (a run of carriage return / line feed characters)
    HAS_CR = re.compile(r'[\r\n]+', re.DOTALL | re.MULTILINE)

    # Markdown prefix written when a heading tag opens
    _HEADING_PREFIXES = {
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'h4': '#### ',
        'h5': '##### ',
        'h6': '###### ',
    }

    # Marker written on both sides of emphasised text
    _EMPHASIS_MARKERS = {
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
    }

    def __init__(self, **kwargs):
        """
        Initialize the converter; keyword arguments are passed on to the
        parent class unchanged.
        """
        super().__init__(**kwargs)

        # Markdown link target, e.g. "(http://host/)", of the anchor whose
        # content is currently being read; empty when there is none
        self._link = ""

        # True while inside <code>, <pre> or <samp>; the text found there
        # is stored verbatim
        self._preserve_cr = False

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list

        Outside of code the whitespace is tidied and the markdown
        characters are escaped.  Text read while a hyperlink is active is
        written as ``[text](href)``.
        """
        if not self._do_store:
            return

        if self._preserve_cr:
            # Markdown renders backslash escapes literally inside code
            # spans and fenced blocks, so the text is stored untouched
            content = data

        else:
            # Tidy our whitespace and escape markdown characters
            content = self.WS_TRIM.sub(' ', data)
            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)

        # Add hyperlink; link syntax would be shown literally inside code
        # and whitespace on its own is not worth linking
        if self._link and not self._preserve_cr and content.strip():
            content = "[" + content + "]" + self._link

        self._result.append(content)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag
        """
        # Toggle initial states
        self._do_store = tag not in self.IGNORE_TAGS

        if tag in self.BLOCK_TAGS:
            # A new block ends any active hyperlink; this also stops an
            # unclosed <a> from turning the rest of the message into links.
            # Inline tags (<b>, <em>, ...) deliberately leave it untouched
            # so that <a href="x"><b>text</b></a> keeps its target.
            self._link = ""
            self._result.append(self.BLOCK_END)

        if tag == 'li':
            self._result.append('- ')

        elif tag == 'br':
            self._result.append('\n')

        elif tag == 'hr':
            # Drop trailing spaces ahead of the rule.  BLOCK_END is defined
            # by the parent class outside of this view, so only strings
            # are touched in case it is not one.
            if self._result and isinstance(self._result[-1], str):
                self._result[-1] = self._result[-1].rstrip(' ')

            self._result.append('\n---\n')

        elif tag == 'blockquote':
            self._result.append('> ')

        elif tag in self._HEADING_PREFIXES:
            self._result.append(self._HEADING_PREFIXES[tag])

        elif tag in self._EMPHASIS_MARKERS:
            self._result.append(self._EMPHASIS_MARKERS[tag])

        elif tag == 'code':
            self._result.append('`')
            self._preserve_cr = True

        elif tag in ('pre', 'samp'):
            self._result.append('```')
            self._result.append(self.BLOCK_END)
            self._preserve_cr = True

        elif tag == 'a':
            # A new anchor replaces whatever link was active before it
            self._link = ""
            for name, link in attrs:  # pragma: no branch
                if name == 'href':
                    # A valueless attribute (<a href>) is reported with a
                    # value of None; treat it as an anchor without a link
                    if link is not None:
                        self._link = '(' + link + ')'
                    # Take an early exit for speed (in case there are more
                    # parameters - no need to waste time looking at them)
                    break

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags
        """
        self._do_store = True

        # The hyperlink ends with its anchor, or with the block around it
        if tag == 'a' or tag in self.BLOCK_TAGS:
            self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        if tag in self._EMPHASIS_MARKERS:
            self._result.append(self._EMPHASIS_MARKERS[tag])

        elif tag == 'code':
            self._result.append('`')
            self._preserve_cr = False

        elif tag in ('pre', 'samp'):
            self._result.append('```')
            self._result.append(self.BLOCK_END)
            self._preserve_cr = False
125 changes: 125 additions & 0 deletions test/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ def to_html(body):
"<a href='#'>my link</a>") == \
"test my link"

# a with missing href entry
assert to_html("<span></span<<span>test</span> "
"<a>my link</a>") == \
"test my link"

# </p> missing
assert to_html("<body><div>line 1 <b>bold</b></div> "
" <a href='#'>my link</a>"
Expand Down Expand Up @@ -143,6 +148,126 @@ def to_html(body):
assert to_html(object)


def test_conversion_html_to_markdown():
    """conversion: Test HTML to markdown
    """

    def to_markdown(body):
        """
        A function to simplify html to markdown conversion tests
        """
        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)

    assert to_markdown("No HTML code here.") == "No HTML code here."

    clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
    assert "- Lots and lots" in clist
    assert "- of lists." in clist

    assert "> To be or not to be." == to_markdown(
        "<blockquote>To be or not to be.</blockquote>")

    cspace = to_markdown(
        "<h2>Fancy heading</h2>"
        "<p>And a paragraph too.<br>Plus line break.</p>")
    # Pin the heading level; "# Fancy heading" on its own would also be
    # found in the output of any other heading level
    assert "## Fancy heading" in cspace
    assert "### Fancy heading" not in cspace
    assert "And a paragraph too.\nPlus line break." in cspace

    assert to_markdown(
        "<style>body { font: 200%; }</style>"
        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here."

    assert to_markdown(
        "<p>line 1</p>"
        "<p>line 2</p>"
        "<p>line 3</p>") == "line 1\nline 2\nline 3"

    # Case sensitivity
    assert to_markdown(
        "<p>line 1</P>"
        "<P>line 2</P>"
        "<P>line 3</P>") == "line 1\nline 2\nline 3"

    # double new lines (testing <br/> and <br>)
    assert to_markdown(
        "some information<br/><br>and more information") == \
        "some information\n\nand more information"

    #
    # Test bad tags
    #

    # the headings are well formed; the paragraphs that follow have invalid
    # or missing closing tags and are converted as best as possible
    assert to_markdown(
        "<h1>Heading 1</h1>"
        "<h2>Heading 2</h2>"
        "<h3>Heading 3</h3>"
        "<h4>Heading 4</h4>"
        "<h5>Heading 5</h5>"
        "<h6>Heading 6</h6>"
        "<p>line 1</>"
        "<p><em>line 2</em></gar>"
        "<p>line 3>") == \
        "# Heading 1\n## Heading 2\n### Heading 3\n" \
        "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
        "line 1\n*line 2*\nline 3>"

    # Make sure we ignore fields that aren't important to us
    assert to_markdown(
        "<script>ignore this</script>"
        "<p>line 1</p>"
        "Another line without being enclosed") == \
        "line 1\nAnother line without being enclosed"

    # Test <code> and <pre>
    assert to_markdown(
        "<code>multi-line 1\nmulti-line 2</code>more content"
        "<pre>multi-line 1\nmulti-line 2</pre>more content") == \
        '`multi-line 1\nmulti-line 2`more content' \
        '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'

    # Test cases when there are no new lines (we're dealing with just inline
    # entries); an empty entry as well
    assert to_markdown("<span></span<<span>test</span> "
                       "<a href='#'>my link</a>") == \
        "test [my link](#)"

    # </p> missing
    assert to_markdown("<body><div>line 1 <b>bold</b></div> "
                       " <a href='/link'>my link</a>"
                       "<p>3rd line</body>") == \
        "line 1 **bold**\n[my link](/link)\n3rd line"

    # <hr/> on its own
    assert to_markdown("<hr/>") == "---"
    assert to_markdown("<hr>") == "---"

    # We need to handle HTML Encodings
    assert to_markdown("""
<html>
<title>ignore this entry</title>
<body>
Let&apos;s handle&nbsp;special html encoding
<hr/>
</body>
""") == "Let's handle special html encoding\n---"

    # If you give nothing, you get nothing in return
    assert to_markdown("") == ""

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(None)

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(42)

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(object)


def test_conversion_text_to():
"""conversion: Test Text to all types
"""
Expand Down