- Improved support for cmark 0.30 for ATX headings.

- Added license identifiers for cmark.
- Changed variable name according to the cmark 0.30 specs.
This commit is contained in:
Franco Masotti 2022-01-24 20:06:24 +01:00
parent 3c819ea781
commit e264f1d8da
Signed by: frnmst
GPG Key ID: 24116ED85666780A
4 changed files with 129 additions and 16 deletions

View File

@ -939,6 +939,7 @@ def get_atx_heading(line: str,
if len(subl) == 0 or subl[0] == '\u005c':
continue
# Preceding.
i = 0
while i < len(subl) and subl[i] == ' ' and i <= md_parser['github'][
'header']['max space indentation']:
@ -946,6 +947,7 @@ def get_atx_heading(line: str,
if i > md_parser['github']['header']['max space indentation']:
continue
# ATX characters.
offset = i
while i < len(subl) and subl[i] == '#' and i <= md_parser['github'][
'header']['max levels'] + offset:
@ -956,21 +958,37 @@ def get_atx_heading(line: str,
current_headers = i - offset
# Include special cases for l endings which should not be
# At this moment GFM is still at version 0.29
# while cmark is at 0.30. There are subtle differences
# such as this one. Assume gitlab is on par with 0.30.
if parser in ['github', 'commonmarker']:
# GFM 0.29 and cmark 0.29.
spaces = [' ']
else:
# cmark 0.30.
spaces = [' ', '\u0009']
# Include special cases for line endings which should not be
# discarded as non-ATX headers.
if i < len(subl) and (subl[i] != ' ' and subl[i] != '\u000a'
and subl[i] != '\u000d'):
if i < len(subl) and subl[i] not in spaces + ['\u000a', '\u000d']:
continue
i += 1
# Exclude leading whitespaces after the ATX header identifier.
while i < len(subl) and subl[i] == ' ':
# [0.29]:
# The opening sequence of `#` characters must be followed
# by a space or by the end of line.
# [0.30]:
# The opening sequence of `#` characters must be followed
# by spaces or tabs, or by the end of line.
while i < len(subl) and subl[i] in spaces:
i += 1
# An algorithm to find the start and the end of the closing sequence (cs).
# The closing sequence includes all the significant part of the
# string. This algorithm has a complexity of O(n) with n being the
# length of the l.
# length of the line.
cs_start = i
cs_end = cs_start
# subl_prime =~ subl'.
@ -982,7 +1000,7 @@ def get_atx_heading(line: str,
hash_round_start = i
# Ignore all characters after newlines and carriage returns which
# are not at the end of the l.
# are not at the end of the line.
# See the two CRLF marker tests.
crlf_marker = 0
stripped_crlf = False
@ -1004,7 +1022,7 @@ def get_atx_heading(line: str,
# Cut spaces and hashes.
while go and i < len_subl - cs_start:
if (subl_prime[i] not in [' ', '#']
if (subl_prime[i] not in spaces + ['#']
or hash_char_rounds > 1):
if i > hash_round_start and hash_char_rounds > 0:
cs_end = len_subl - hash_round_start
@ -1012,7 +1030,7 @@ def get_atx_heading(line: str,
cs_end = len_subl - i
go = False
if go:
while subl_prime[i] == ' ':
while subl_prime[i] in spaces:
i += 1
hash_round_start = i
while subl_prime[i] == '#':

View File

@ -65,6 +65,7 @@ class _cmarkCmarkReferenceMap:
class _cmarkCmarkNode:
# license C applies here. See docs/copyright_license.rst
def __init__(self):
# cmark_strbuf
@ -121,6 +122,8 @@ class _cmarkCmarkNode:
class _cmarkCmarkChunk:
r"""See chunk.h file."""
# license E applies here. See docs/copyright_license.rst
def __init__(self, data: str = None, length: int = 0, alloc: int = 0):
self.data: str = data
self.length: int = length
@ -132,6 +135,8 @@ class _cmarkCmarkChunk:
class _cmarkDelimiter:
r"""A list node with attributes useful for processing emphasis."""
# license C applies here. See docs/copyright_license.rst
def __init__(self, delim_char: str, length: int):
self.previous = None
self.next = None
@ -181,6 +186,8 @@ class _cmarkDelimiter:
class _cmarkSubject:
r"""A double linked list useful for processing emphasis."""
# license C applies here. See docs/copyright_license.rst
def __init__(self):
r"""Define the memory allocation functions to be used by CMark when parsing and allocating a document tree.
@ -271,6 +278,7 @@ class _cmarkSubject:
class _cmarkBracket:
# license C applies here. See docs/copyright_license.rst
def __init__(self):
self.previous = None
self.previous_delimiter = None
@ -286,6 +294,7 @@ class _cmarkBracket:
# 0.29, 0.30
def _cmark_advance(subj: _cmarkSubject):
# license C applies here. See docs/copyright_license.rst.
# Advance the subject. Doesn't check for eof.
subj.pos += 1
@ -293,7 +302,7 @@ def _cmark_advance(subj: _cmarkSubject):
# 0.29, 0.30
def _cmark_cmark_utf8proc_is_space(char: int, parser: str = 'github') -> bool:
r"""Match anything in the Zs class, plus LF, CR, TAB, FF."""
# license C applies here. See docs/copyright_license.rst.
# license D applies here. See docs/copyright_license.rst.
value = False
if chr(char) in md_parser[parser]['pseudo-re']['UWC']:
value = True
@ -317,7 +326,7 @@ def _cmark_cmark_utf8proc_is_punctuation(char: int, parser: str = 'github') -> b
r"""Match anything in the P[cdefios] classes."""
# license C applies here. See docs/copyright_license.rst.
value = False
if (char < 128 and _cmark_cmark_ispunct(char)) or chr(char) in md_parser[parser]['pseudo-re']['PC']:
if (char < 128 and _cmark_cmark_ispunct(char)) or chr(char) in md_parser[parser]['pseudo-re']['UPC']:
value = True
return value
@ -362,7 +371,6 @@ def _cmark_cmark_utf8proc_charlen(line: str, line_length: int) -> int:
# 0.29, 0.30
def _cmark_cmark_utf8proc_iterate(line: str, line_len: int) -> tuple:
# license D applies here. See docs/copyright_license.rst
length: int = 0
uc: int = -1
dst: int = -1
@ -495,6 +503,7 @@ def _cmark_cmark_chunk_dup(ch: _cmarkCmarkChunk, pos: int, length: int) -> str:
# 0.30
def _cmark_cmark_chunk_literal(data: str) -> _cmarkCmarkChunk:
# license E applies here. See docs/copyright_license.rst
length: int
c: _cmarkCmarkChunk
@ -539,7 +548,6 @@ def _cmark_handle_delim(subj: _cmarkSubject, c: str, smart: bool = False) -> _cm
# 0.29, 0.30
def _cmark_peek_char(subj: _cmarkSubject) -> int:
# license C applies here. See docs/copyright_license.rst
# Instead of using assert just raise a ValueError
if subj.pos < subj.input.length and ord(subj.input.data[subj.pos]) == 0:
raise ValueError
@ -553,6 +561,7 @@ def _cmark_peek_char(subj: _cmarkSubject) -> int:
# 0.30
# Unlink a node without adjusting its next, prev, and parent pointers.
def _cmark_S_node_unlink(node: _cmarkCmarkNode):
# license C applies here. See docs/copyright_license.rst
if node is None:
return
@ -573,6 +582,7 @@ def _cmark_S_node_unlink(node: _cmarkCmarkNode):
# 0.30
def _cmark_S_free_nodes(e: _cmarkCmarkNode):
# license C applies here. See docs/copyright_license.rst
mem = e.mem
next: _cmarkCmarkNode
@ -595,6 +605,7 @@ def _cmark_S_free_nodes(e: _cmarkCmarkNode):
# 0.30
def _cmark_cmark_node_free(node: _cmarkCmarkNode):
# license C applies here. See docs/copyright_license.rst
_cmark_S_node_unlink(node)
node.next = None
_cmark_S_free_nodes(node)
@ -683,6 +694,7 @@ def _cmark_remove_emph(subj: _cmarkSubject, opener: _cmarkDelimiter, closer: _cm
# 0.30
def _cmark_cmark_set_cstr(mem, dst: str, src: str) -> int:
# license C applies here. See docs/copyright_license.rst
old: str = dst
length: int
@ -702,6 +714,7 @@ def _cmark_cmark_set_cstr(mem, dst: str, src: str) -> int:
# 0.30
def _cmark_cmark_node_set_literal(node: _cmarkCmarkNode, content: str) -> int:
# license C applies here. See docs/copyright_license.rst
if node is None:
return 0
@ -858,10 +871,10 @@ def _cmark_make_literal(subj: _cmarkSubject, t: int, start_column: int, end_colu
# 0.29, 0.30
def _cmark_make_str(subj: _cmarkSubject, sc: int, ec: int, s: _cmarkCmarkChunk) -> _cmarkCmarkNode:
# license C applies here. See docs/copyright_license.rst
# s = char
# sc = start column
# ec = end column
# license C applies here. See docs/copyright_license.rst
e = _cmark_make_literal(subj, md_parser['cmark']['cmark_node_type']['CMARK_NODE_TEXT'], sc, ec)
# Realloc with NULL ptr is equal to malloc, so no need to translate
@ -896,7 +909,6 @@ def _cmark_push_bracket(subj: _cmarkSubject, image: bool, inl_text: _cmarkCmarkN
# 0.29, 0.30
def _cmark_subject_find_special_char(subj: _cmarkSubject, options: int) -> int:
# license C applies here. See docs/copyright_license.rst
# "\r\n\\`&_*[]<!"
SPECIAL_CHARS: list = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -970,6 +982,7 @@ def _cmark_subject_from_buf(mem, line_number: int,
# 0.30
def _cmark_S_is_line_end_char(c: str) -> bool:
# license C applies here. See docs/copyright_license.rst
return c == '\n' or c == '\r'
@ -980,6 +993,7 @@ def _cmark_S_is_line_end_char(c: str) -> bool:
# int cmark_isspace(char c) { return cmark_ctype_class[(uint8_t)c] == 1; }
# The only defined whitespaces in the spec are Unicode whitespaces.
def _cmark_cmark_chunk_rtrim(c: _cmarkCmarkChunk):
# license E applies here. See docs/copyright_license.rst
while c.length > 0:
# if (!cmark_isspace(c->data[c->len - 1]))
if not c.data[c.length - 1] in md_parser['cmark']['pseudo-re']['UWC']:
@ -1038,6 +1052,7 @@ def _cmark_parse_inline(subj: _cmarkSubject, parent: _cmarkCmarkNode, options: i
# 0.30
def _cmark_pop_bracket(subj: _cmarkSubject):
# license C applies here. See docs/copyright_license.rst
b: _cmarkBracket
if subj.last_bracket is None:
return
@ -1052,6 +1067,7 @@ def _cmark_pop_bracket(subj: _cmarkSubject):
def _cmark_cmark_parse_inlines(mem, parent: _cmarkCmarkNode,
refmap: _cmarkCmarkReferenceMap, options: int) -> list:
r"""Get the ignore list."""
# license C applies here. See docs/copyright_license.rst
subj: _cmarkSubject
content: _cmarkCmarkChunk = _cmarkCmarkChunk(parent.data, parent.length)
subj = _cmarkSubject()

View File

@ -922,10 +922,10 @@ parser['cmark']['pseudo-re'] = {
],
}
# Punctuation character.
# Unicode punctuation character.
# Removed parser['cmark']['pseudo-re']['APC'] because check is done
# manually in the md_toc.cmark._cmark_cmark_utf8proc_is_punctuation function.
parser['cmark']['pseudo-re']['PC'] = (
parser['cmark']['pseudo-re']['UPC'] = (
parser['cmark']['pseudo-re']['PGUCPC']
+ parser['cmark']['pseudo-re']['PGUCPD']
+ parser['cmark']['pseudo-re']['PGUCPF']

View File

@ -48,6 +48,16 @@ S10 = 10 * ' '
S18 = 18 * ' '
S21 = 21 * ' '
# Tabs.
T1 = 1 * '\u0009'
T2 = 2 * '\u0009'
T3 = 3 * '\u0009'
T4 = 4 * '\u0009'
T5 = 5 * '\u0009'
T10 = 10 * '\u0009'
T18 = 18 * '\u0009'
T21 = 21 * '\u0009'
# ATX headers.
H1 = 1 * '#'
H2 = 2 * '#'
@ -1345,21 +1355,62 @@ class TestApi(unittest.TestCase):
self.assertEqual(
api.get_atx_heading(H1 + S1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H1 + T1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H1 + T1 + CMARK_LINE_FOO, m_github, 'cmark'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H2 + S1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': 2, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H2 + T1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H2 + T1 + CMARK_LINE_FOO, m_github, 'cmark'),
[{'header type': 2, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H3 + S1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': 3, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H3 + T1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H3 + T1 + CMARK_LINE_FOO, m_github, 'cmark'),
[{'header type': 3, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H4 + S1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': 4, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H4 + T1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H4 + T1 + CMARK_LINE_FOO, m_github, 'cmark'),
[{'header type': 4, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H5 + S1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': 5, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H5 + T1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H5 + T1 + CMARK_LINE_FOO, m_github, 'cmark'),
[{'header type': 5, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H6 + S1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': 6, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H6 + T1 + CMARK_LINE_FOO, m_github, 'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H6 + T1 + CMARK_LINE_FOO, m_github, 'cmark'),
[{'header type': 6, 'header text trimmed': CMARK_LINE_FOO, }])
# Example 33 [Commonmark 0.28].
# Example 33 [Commonmark 0.29].
@ -1411,6 +1462,14 @@ class TestApi(unittest.TestCase):
api.get_atx_heading(H1 + S18 + CMARK_LINE_FOO + S21, m_github,
'github'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H1 + T18 + CMARK_LINE_FOO + T21, m_github,
'github'),
[{'header type': None, 'header text trimmed': None, }])
self.assertEqual(
api.get_atx_heading(H1 + T18 + CMARK_LINE_FOO + T21, m_github,
'cmark'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO, }])
# Example 38 [Commonmark 0.28].
# Example 38 [Commonmark 0.29].
@ -1480,6 +1539,14 @@ class TestApi(unittest.TestCase):
api.get_atx_heading(H3 + S1 + CMARK_LINE_FOO + S1 + H3 + S5,
m_github, 'github'),
[{'header type': 3, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H3 + S1 + CMARK_LINE_FOO + S1 + H3 + T5,
m_github, 'github'),
[{'header type': 3, 'header text trimmed': CMARK_LINE_FOO + S1 + H3 + T5, }])
self.assertEqual(
api.get_atx_heading(H3 + S1 + CMARK_LINE_FOO + S1 + H3 + T5,
m_github, 'cmark'),
[{'header type': 3, 'header text trimmed': CMARK_LINE_FOO, }])
# Example 44 [Commonmark 0.28].
# Example 44 [Commonmark 0.29].
@ -1497,6 +1564,18 @@ class TestApi(unittest.TestCase):
api.get_atx_heading(H1 + S1 + CMARK_LINE_FOO + H1, m_github,
'github'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO + H1, }])
self.assertEqual(
api.get_atx_heading(H1 + S1 + CMARK_LINE_FOO + S1 + H1, m_github,
'github'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO, }])
self.assertEqual(
api.get_atx_heading(H1 + S1 + CMARK_LINE_FOO + T1 + H1, m_github,
'github'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO + T1 + H1, }])
self.assertEqual(
api.get_atx_heading(H1 + S1 + CMARK_LINE_FOO + T1 + H1, m_github,
'cmark'),
[{'header type': 1, 'header text trimmed': CMARK_LINE_FOO, }])
# Example 46 [Commonmark 0.28].
# Example 46 [Commonmark 0.29].