Separated cmark code into different modules, following the layout of the original C source code.

This commit is contained in:
Franco Masotti 2022-04-02 17:31:15 +02:00
parent 254eea4886
commit a5bbc275dd
Signed by: frnmst
GPG Key ID: 24116ED85666780A
12 changed files with 1110 additions and 866 deletions

View File

@ -28,7 +28,7 @@ import sys
import fpyutils
from . import generic
from .cmark import cmark
from .cmark import inlines_c, node_h
from .constants import common_defaults
from .constants import parser as md_parser
from .exceptions import (CannotTreatUnicodeString, GithubEmptyLinkLabel,
@ -712,14 +712,14 @@ def remove_emphasis(line: str, parser: str = 'github') -> list:
mem = None
refmap = None
parent = cmark._cmarkCmarkNode()
parent = node_h._cmarkCmarkNode()
parent.data = line
parent.length = len(line)
parent.start_line = 0
parent.start_column = 0
parent.internal_offset = 1
ignore = cmark._cmark_cmark_parse_inlines(mem, parent, refmap, 0)
ignore = inlines_c._cmark_cmark_parse_inlines(mem, parent, refmap, 0)
line = filter_indices_from_line(line, ignore)
elif parser in ['redcarpet']:

21
md_toc/cmark/__init__.py Normal file
View File

@ -0,0 +1,21 @@
#
# __init__.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
"""Python discovery file."""

145
md_toc/cmark/buffer_c.py Normal file
View File

@ -0,0 +1,145 @@
#
# buffer_c.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
r"""A cmark implementation file."""
import copy
import sys
from .buffer_h import _cmark_CMARK_BUF_INIT, _cmarkCmarkStrbuf
from .cmark_h import _cmarkCmarkMem
# 0.29, 0.30
def _cmark_cmark_strbuf_init(mem: _cmarkCmarkMem, buf: _cmarkCmarkStrbuf, initial_size: int):
buf.mem = mem
buf.asize = 0
buf.size = 0
buf.ptr = str()
if initial_size > 0:
_cmark_cmark_strbuf_grow(buf, initial_size)
# 0.29, 0.30
def _cmark_cmark_strbuf_grow(buf: _cmarkCmarkStrbuf, target_size: int):
# Instead of using assert just raise a ValueError
if target_size <= 0:
raise ValueError
if target_size < buf.asize:
return
# Usually it is this value and it is defined in stdint.h.
INT32_MAX = (2 << 30) - 1
# Truncate number to a length of 30 bits.
target_size &= INT32_MAX
if target_size > INT32_MAX / 2:
print("[cmark] _cmark_cmark_strbuf_grow requests buffer with size > " + str(INT32_MAX / 2) + ", aborting")
sys.exit(1)
# Oversize the buffer by 50% to guarantee amortized linear time
# complexity on append operations.
# See also
# https://codeyarns.com/tech/2019-03-06-integer-division-in-c.html
# for the integer division.
new_size: int = target_size + int(target_size / 2)
new_size += 1
new_size = (new_size + 7) & ~7
# No need to malloc.
# if buf.asize:
# buf->ptr = buf->mem->realloc(buf->ptr, new_size);
# else:
# buf->ptr = buf->mem->malloc(newsize)
buf.asize = new_size
# 0.29, 0.30
def _cmark_cmark_strbuf_clear(buf: _cmarkCmarkStrbuf):
buf.size = 0
if buf.asize > 0:
buf.ptr = str()
# 0.29, 0.30
def _cmark_cmark_strbuf_set(buf: _cmarkCmarkStrbuf, data: str, length: int):
if length <= 0 or data is None:
_cmark_cmark_strbuf_clear(buf)
else:
if data != buf.ptr:
if length >= buf.asize:
_cmark_cmark_strbuf_grow(buf, length)
# alternative to
# memmove(buf->ptr, data, len)
buf.ptr = copy.deepcopy(data[0:length])
buf.size = length
# No need to set termination character
# buf.ptr[buf.size] = '\0'
# 0.29, 0.30
def _cmark_cmark_strbuf_detach(buf: _cmarkCmarkStrbuf) -> str:
data: str = buf.ptr
if buf.asize == 0:
# return an empty string
# return (unsigned char *)buf->mem->calloc(1, 1);
return str()
_cmark_cmark_strbuf_init(buf.mem, buf, 0)
return data
# 0.29, 0.30
def _cmark_cmark_strbuf_truncate(buf: _cmarkCmarkStrbuf, len: int):
if len < 0:
len = 0
if len < buf.size:
buf.size = len
# No need for the terminator character.
# buf.ptr[buf.size] = '\0'
# 0.29, 0.30
def _cmark_cmark_strbuf_drop(buf: _cmarkCmarkStrbuf, n: int):
if n > 0:
if n > buf.size:
n = buf.size
buf.size = buf.size - n
if buf.size:
# Alternative to
# memmove(buf->ptr, buf->ptr + n, buf->size);
buf.ptr = copy.deepcopy(buf.ptr[n:buf.size])
# No need for the terminator character.
# buf->ptr[buf->size] = '\0';
if __name__ == '__main__':
pass

46
md_toc/cmark/buffer_h.py Normal file
View File

@ -0,0 +1,46 @@
#
# buffer_h.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
r"""A cmark implementation file."""
from .cmark_h import _cmarkCmarkMem
class _cmarkCmarkStrbuf:
def __init__(self):
self.mem: _cmarkCmarkMem = None
self.ptr: str = str()
self.asize: int = 0
self.size: int = 0
# Should be equivalent to
# #define CMARK_BUF_INIT(mem) \
# { mem, cmark_strbuf__initbuf, 0, 0 }
# 0.29, 0.30
def _cmark_CMARK_BUF_INIT(mem):
    r"""Return a new empty ``_cmarkCmarkStrbuf`` whose ``mem`` attribute is ``mem``."""
    fresh = _cmarkCmarkStrbuf()
    fresh.mem = mem

    return fresh
if __name__ == '__main__':
pass

71
md_toc/cmark/chunk_h.py Normal file
View File

@ -0,0 +1,71 @@
#
# chunk_h.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
r"""The cmark implementation file."""
import copy
from ..constants import parser as md_parser
class _cmarkCmarkChunk:
r"""See chunk.h file."""
# license E applies here. See docs/copyright_license.rst
def __init__(self, data: str = None, length: int = 0, alloc: int = 0):
self.data: str = data
self.length: int = length
# Returns 1 if c is a "whitespace" character as defined by the spec.
# int cmark_isspace(char c) { return cmark_ctype_class[(uint8_t)c] == 1; }
# The only defined whitespaces in the spec are Unicode whitespaces.
# 0.30
def _cmark_cmark_chunk_rtrim(c: _cmarkCmarkChunk):
    # license E applies here. See docs/copyright_license.rst
    r"""Shrink ``c.length`` until the last counted character is not whitespace.

    Only ``c.length`` is changed; ``c.data`` is left as it is. Whitespace
    is whatever the 'UWC' entry of the cmark parser constants contains.
    """
    # Counterpart of: if (!cmark_isspace(c->data[c->len - 1])) break;
    while (c.length > 0
           and c.data[c.length - 1] in md_parser['cmark']['pseudo-re']['UWC']):
        c.length -= 1
# 0.30
def _cmark_cmark_chunk_literal(data: str) -> _cmarkCmarkChunk:
    # license E applies here. See docs/copyright_license.rst
    r"""Wrap ``data`` in a chunk whose length is ``len(data)``, or 0 for ``None``."""
    size: int = 0 if data is None else len(data)

    return _cmarkCmarkChunk(data, size)
# 0.29, 0.30
def _cmark_cmark_chunk_dup(ch: _cmarkCmarkChunk, pos: int, length: int) -> _cmarkCmarkChunk:
    # license E applies here. See docs/copyright_license.rst
    r"""Return a new chunk built from ``ch.data[pos:pos + length]``.

    :parameter ch: the source chunk. It is not modified.
    :parameter pos: the index of the first character to take.
    :parameter length: the number of characters requested. It is stored
        as-is in the new chunk.
    :returns: a new ``_cmarkCmarkChunk``.
    """
    # Slicing a str already yields an independent immutable value, so no
    # extra copy is required.
    return _cmarkCmarkChunk(ch.data[pos: pos + length], length)

View File

@ -0,0 +1,34 @@
#
# cmark_ctype_c.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
r"""The cmark implementation file."""
from ..constants import parser as md_parser
# 0.29, 0.30
def _cmark_cmark_ispunct(char: int, parser: str = 'github') -> bool:
    r"""Return True if c is an ascii punctuation character.

    :parameter char: the code point to check.
    :parameter parser: the parser whose 'APC' constant is consulted.
    """
    # license C applies here. See docs/copyright_license.rst.
    return chr(char) in md_parser[parser]['pseudo-re']['APC']

37
md_toc/cmark/cmark_h.py Normal file
View File

@ -0,0 +1,37 @@
#
# cmark_h.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
r"""A cmark implementation file."""
# /** Defines the memory allocation functions to be used by CMark
# * when parsing and allocating a document tree
# */
# typedef struct cmark_mem {
# void *(*calloc)(size_t, size_t);
# void *(*realloc)(void *, size_t);
# void (*free)(void *);
# } cmark_mem;
class _cmarkCmarkMem:
pass
if __name__ == '__main__':
pass

View File

@ -0,0 +1,55 @@
#
# cmark_reference_h.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This file is part of md-toc.
#
# md-toc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# md-toc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
r"""The cmark implementation file."""
from ..generic import _noop
class _cmarkCmarkReference:
def __init__(self):
next = None
label: str = None
url: str = None
title: str = None
age: int = 0
size: int = 0
_noop(next)
_noop(label)
_noop(url)
_noop(title)
_noop(age)
_noop(size)
class _cmarkCmarkReferenceMap:
def __init__(self):
mem = None
refs: _cmarkCmarkReference
sorted: _cmarkCmarkReference
size: int = 0
ref_size: int = 0
max_ref_size: int = 0
_noop(mem)
_noop(size)
_noop(ref_size)
_noop(max_ref_size)

View File

@ -1,5 +1,5 @@
#
# cmark.py
# inlines_c.py
#
# Copyright (C) 2017-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
@ -18,114 +18,30 @@
# You should have received a copy of the GNU General Public License
# along with md-toc. If not, see <http://www.gnu.org/licenses/>.
#
"""The cmark implementation file."""
r"""A cmark implementation file."""
import copy
import sys
from ..constants import parser as md_parser
from ..exceptions import CannotTreatUnicodeString
from ..generic import _noop, _replace_substring
from .buffer_c import (_cmark_cmark_strbuf_detach, _cmark_cmark_strbuf_drop,
_cmark_cmark_strbuf_set, _cmark_cmark_strbuf_truncate)
from .buffer_h import _cmark_CMARK_BUF_INIT, _cmarkCmarkStrbuf
from .chunk_h import (_cmark_cmark_chunk_dup, _cmark_cmark_chunk_literal,
_cmark_cmark_chunk_rtrim, _cmarkCmarkChunk)
from .cmark_ctype_c import _cmark_cmark_ispunct
from .cmark_reference_h import _cmarkCmarkReferenceMap
from .node_c import _cmark_cmark_node_free, _cmark_cmark_node_set_literal
from .node_h import _cmarkCmarkNode
from .utf8_c import (_cmark_cmark_utf8proc_is_punctuation,
_cmark_cmark_utf8proc_is_space,
_cmark_cmark_utf8proc_iterate)
class _cmarkCmarkReference:
def __init__(self):
next = None
label: str = None
url: str = None
title: str = None
age: int = 0
size: int = 0
_noop(next)
_noop(label)
_noop(url)
_noop(title)
_noop(age)
_noop(size)
class _cmarkCmarkReferenceMap:
def __init__(self):
mem = None
refs: _cmarkCmarkReference
sorted: _cmarkCmarkReference
size: int = 0
ref_size: int = 0
max_ref_size: int = 0
_noop(mem)
_noop(size)
_noop(ref_size)
_noop(max_ref_size)
class _cmarkCmarkNode:
# 0.29, 0.30
def _cmark_make_linebreak(mem):
# license C applies here. See docs/copyright_license.rst
def __init__(self):
# cmark_strbuf
# /** Defines the memory allocation functions to be used by CMark
# * when parsing and allocating a document tree
# */
# typedef struct cmark_mem {
# void *(*calloc)(size_t, size_t);
# void *(*realloc)(void *, size_t);
# void (*free)(void *);
# } cmark_mem;
self.mem = None
self.type = None
# Main.
self.data = None
self.length = 0
self.prev = None
self.next = None
self.parent = None
self.first_child = None
self.last_child = None
self.user_data = None
self.start_line = 0
self.start_column = 0
self.end_line = 0
self.end_column = 0
self.internal_offset = 0
# Add a new variable.
self.numdelims: int = 0
def append_child(self):
pass
def append_child_lite(self, child):
old_last_child: _cmarkCmarkNode = self.last_child
child.next = None
child.prev = old_last_child
child.parent = self
self.last_child = child
if old_last_child:
old_last_child.next = child
else:
# Also set first_child if node previously had no children.
self.first_child = child
class _cmarkCmarkChunk:
r"""See chunk.h file."""
# license E applies here. See docs/copyright_license.rst
def __init__(self, data: str = None, length: int = 0, alloc: int = 0):
self.data: str = data
self.length: int = length
# also implies a NULL-terminated string
self.alloc: int = alloc
_cmark_make_simple(mem, md_parser['cmark']['cmark_node_type']['CMARK_NODE_LINEBREAK'])
class _cmarkDelimiter:
@ -179,20 +95,28 @@ class _cmarkDelimiter:
+ cc + '\n')
class _cmarkBracket:
# license C applies here. See docs/copyright_license.rst
def __init__(self):
self.previous = None
self.previous_delimiter = None
# _cmarkCmarkNode
self.inl_text = None
self.position = 0
self.image = False
self.active = True
self.bracket_after = False
class _cmarkSubject:
r"""A double linked list useful for processing emphasis."""
# license C applies here. See docs/copyright_license.rst
def __init__(self):
r"""Define the memory allocation functions to be used by CMark when parsing and allocating a document tree.
typedef struct cmark_mem {
void *(*calloc)(size_t, size_t);
void *(*realloc)(void *, size_t);
void (*free)(void *);
} cmark_mem;
"""
# cmark_mem
self.mem = None
self.line = 0
@ -273,50 +197,113 @@ class _cmarkSubject:
x = x.next
class _cmarkBracket:
# 0.30
def _cmark_S_is_line_end_char(c: str) -> bool:
# license C applies here. See docs/copyright_license.rst
def __init__(self):
self.previous = None
self.previous_delimiter = None
# _cmarkCmarkNode
self.inl_text = None
self.position = 0
self.image = False
self.active = True
self.bracket_after = False
return c == '\n' or c == '\r'
# /** Defines the memory allocation functions to be used by CMark
# * when parsing and allocating a document tree
# */
# typedef struct cmark_mem {
# void *(*calloc)(size_t, size_t);
# void *(*realloc)(void *, size_t);
# void (*free)(void *);
# } cmark_mem;
class _cmarkCmarkMem:
pass
class _cmarkCmarkStrbuf:
def __init__(self):
self.mem: _cmarkCmarkMem = None
self.ptr: str = str()
self.asize: int = 0
self.size: int = 0
# Should be equivalent to
# #define CMARK_BUF_INIT(mem) \
# { mem, cmark_strbuf__initbuf, 0, 0 }
# 0.29, 0.30
def _cmark_CMARK_BUF_INIT(mem):
b = _cmarkCmarkStrbuf()
b.mem = mem
def _cmark_make_literal(subj: _cmarkSubject, t: int, start_column: int, end_column: int) -> _cmarkCmarkNode:
# license C applies here. See docs/copyright_license.rst
r"""Create an inline with a literal string value."""
e = _cmarkCmarkNode()
return b
# cmark_strbuf_init(subj->mem, &e->content, 0)
e.mem = copy.deepcopy(subj.mem)
e.type = t
e.start_line = e.end_line = subj.line
# columns are NOT 1 based.
e.start_column: int = start_column + subj.column_offset + subj.block_offset
e.end_column: int = end_column + subj.column_offset + subj.block_offset
return e
# 0.29, 0.30
def _cmark_make_simple(mem, t: int) -> _cmarkCmarkNode:
    # license C applies here. See docs/copyright_license.rst
    r"""Return a new node of type ``t`` holding a deep copy of ``mem``."""
    node = _cmarkCmarkNode()
    node.mem = copy.deepcopy(mem)
    node.type = t

    return node
# 0.29, 0.30
def _cmark_make_str(subj: _cmarkSubject, sc: int, ec: int, s: _cmarkCmarkChunk) -> _cmarkCmarkNode:
    # license C applies here. See docs/copyright_license.rst
    r"""Create a text node whose content is taken from the chunk ``s``.

    :parameter subj: the subject the node is created for.
    :parameter sc: the start column.
    :parameter ec: the end column.
    :parameter s: the chunk providing the data and the length.
    """
    text_type = md_parser['cmark']['cmark_node_type']['CMARK_NODE_TEXT']
    node = _cmark_make_literal(subj, text_type, sc, ec)

    # Realloc with NULL ptr is equal to malloc, so there is nothing to
    # translate for:
    #   e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);
    if s.data is not None:
        node.data = copy.deepcopy(s.data)

    # No need to add the terminator (\0): the length is stored explicitly.
    node.length = s.length

    return node
# 0.29, 0.30
def _cmark_subject_from_buf(mem, line_number: int,
                            block_offset: int, e: _cmarkSubject, chunk: _cmarkCmarkChunk,
                            refmap: _cmarkCmarkReferenceMap):
    # license C applies here. See docs/copyright_license.rst
    r"""Initialize the subject ``e`` in place so that it reads from ``chunk``.

    :parameter mem: stored in ``e.mem``.
    :parameter line_number: stored in ``e.line``.
    :parameter block_offset: stored in ``e.block_offset``.
    :parameter e: the subject to initialize.
    :parameter chunk: stored in ``e.input``.
    :parameter refmap: stored in ``e.refmap``.
    """
    e.mem = mem
    e.input = chunk
    e.refmap = refmap
    e.line = line_number
    e.block_offset = block_offset

    # Reading starts at the beginning, with no pending delimiters or
    # brackets.
    e.pos = e.column_offset = 0
    e.last_delim = e.last_bracket = None

    # NOTE(review): index MAXBACKTICKS itself is not reset by this loop.
    # Confirm that this is intended.
    for slot in range(0, md_parser['cmark']['generic']['MAXBACKTICKS']):
        e.backticks[slot] = 0
    e.scanned_for_backticks = False
# 0.30
def _cmark_isbacktick(c: int) -> int:
backtick: int = 0
if chr(c) == '`':
backtick = 1
return backtick
# 0.29, 0.30
def _cmark_peek_char(subj: _cmarkSubject) -> int:
# license C applies here. See docs/copyright_license.rst
# Instead of using assert just raise a ValueError
if subj.pos < subj.input.length and ord(subj.input.data[subj.pos]) == 0:
raise ValueError
if subj.pos < subj.input.length:
return ord(subj.input.data[subj.pos])
else:
return 0
# 0.29, 0.30
def _cmark_peek_at(subj: _cmarkSubject, pos: int) -> int:
# license C applies here. See docs/copyright_license.rst
return ord(subj.input.data[pos])
# 0.30
def _cmark_is_eof(subj: _cmarkSubject):
r"""Return true if there are more characters in the subject."""
# license C applies here. See docs/copyright_license.rst.
return subj.pos >= subj.input.length
# 0.29, 0.30
@ -327,113 +314,186 @@ def _cmark_advance(subj: _cmarkSubject):
# 0.29, 0.30
def _cmark_cmark_utf8proc_is_space(char: int, parser: str = 'github') -> bool:
r"""Match anything in the Zs class, plus LF, CR, TAB, FF."""
# license D applies here. See docs/copyright_license.rst.
value = False
if chr(char) in md_parser[parser]['pseudo-re']['UWC']:
value = True
def _cmark_skip_line_end(subj: _cmarkSubject) -> bool:
# license C applies here. See docs/copyright_license.rst
seen_line_end_char: bool = False
return value
if _cmark_peek_char(subj) == '\r':
_cmark_advance(subj)
seen_line_end_char = True
if _cmark_peek_char(subj) == '\n':
_cmark_advance(subj)
seen_line_end_char = True
return seen_line_end_char or _cmark_is_eof(subj)
# Take characters while a predicate holds, and return a string.
# 0.29, 0.30
def _cmark_cmark_ispunct(char: int, parser: str = 'github') -> bool:
r"""Return True if c is an ascii punctuation character."""
# license C applies here. See docs/copyright_license.rst.
value = False
if chr(char) in md_parser[parser]['pseudo-re']['APC']:
value = True
def _cmark_take_while(subj: _cmarkSubject) -> _cmarkCmarkChunk:
r"""Get backtick spanning."""
c: int
startpos: int = subj.pos
len: int = 0
return value
c = _cmark_peek_char(subj)
while _cmark_isbacktick(c):
_cmark_advance(subj)
len += 1
c = _cmark_peek_char(subj)
return _cmark_cmark_chunk_dup(subj.input, startpos, len)
# Return the number of newlines in a given span of text in a subject. If
# the number is greater than zero, also return the number of characters
# between the last newline and the end of the span in `since_newline`.
# 0.29, 0.30
def _cmark_cmark_utf8proc_is_punctuation(char: int, parser: str = 'github') -> bool:
r"""Match anything in the P[cdefios] classes."""
# license C applies here. See docs/copyright_license.rst.
value = False
if (char < 128 and _cmark_cmark_ispunct(char)) or chr(char) in md_parser[parser]['pseudo-re']['UPC']:
value = True
def _cmark_count_newlines(subj: _cmarkSubject, start: int, length: int) -> tuple:
nls: int = 0
since_nl: int = 0
return value
while length > 0:
if subj.input.data[start] == '\n':
nls += 1
since_nl = 0
else:
since_nl += 1
start += 1
length -= 1
# 0.29, 0.30
def _cmark_cmark_utf8proc_charlen(line: str, line_length: int) -> int:
# license D applies here. See docs/copyright_license.rst
length: int
i: int
if not line_length:
if not nls:
return 0
# Use length = 1 instead of the utf8proc_utf8class[256]
# list.
# For example:
# len('ł') == 2 # in Python 2
# len('ł') == 1 # in Python 3
# See the documentation.
# In Python 3 since all strings are unicode by default
# they all have length of 1.
length = 1
if len(line) > 1:
# See
# https://docs.python.org/3/howto/unicode.html#comparing-strings
raise CannotTreatUnicodeString
if not length:
return -1
if line_length >= 0 and length > line_length:
return -line_length
for i in range(1, length):
if (ord(line[i]) & 0xC0) != 0x80:
return -i
return length
since_newline = since_nl
return nls, since_newline
# Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and
# `column_offset` according to the number of newlines in a just-matched span
# of text in `subj`.
# 0.29, 0.30
def _cmark_cmark_utf8proc_iterate(line: str, line_len: int) -> tuple:
# license D applies here. See docs/copyright_license.rst
length: int = 0
uc: int = -1
dst: int = -1
def _cmark_adjust_subj_node_newlines(subj: _cmarkSubject, node: _cmarkCmarkNode, matchlen: int, extra: int, options: int):
if not options & md_parser['cmark']['generic']['CMARK_OPT_SOURCEPOS']:
return
length = _cmark_cmark_utf8proc_charlen(line, line_len)
if length < 0:
return -1, dst
newlines: int
since_newline: int
if length == 1:
uc = ord(line[0])
elif length == 2:
uc = ((ord(line[0]) & 0x1F) << 6) + (ord(line[1]) & 0x3F)
if uc < 0x80:
uc = -1
elif length == 3:
uc = ((ord(line[0]) & 0x0F) << 12) + ((ord(line[1]) & 0x3F) << 6) + (ord(line[2]) & 0x3F)
if uc < 0x800 or (uc >= 0xD800 and uc < 0xE000):
uc = -1
elif length == 4:
uc = (((ord(line[0]) & 0x07) << 18) + ((ord(line[1]) & 0x3F) << 12) +
((ord(line[2]) & 0x3F) << 6) + (ord(line[3]) & 0x3F))
if uc < 0x10000 or uc >= 0x110000:
uc = -1
if uc < 0:
return -1, dst
dst = uc
return length, dst
newlines, since_newline = _cmark_count_newlines(subj, subj.pos - matchlen - extra, matchlen)
if newlines:
subj.line += newlines
node.end_line += newlines
node.end_column = since_newline
subj.column_offset = - subj.pos + since_newline + extra
# Try to process a backtick code span that began with a
# span of ticks of length openticklength length (already
# parsed). Return 0 if you don't find matching closing
# backticks, otherwise return the position in the subject
# after the closing backticks.
# 0.29, 0.30
def _cmark_peek_at(subj: _cmarkSubject, pos: int) -> int:
# license C applies here. See docs/copyright_license.rst
return ord(subj.input.data[pos])
def _cmark_scan_to_closing_backticks(subj: _cmarkSubject, openticklength: int) -> int:
    r"""Look for a run of exactly ``openticklength`` backticks.

    The subject position is advanced while scanning. The start of every
    backtick run met on the way is recorded in ``subj.backticks``.

    :parameter subj: the subject to scan, starting from ``subj.pos``.
    :parameter openticklength: the length of the opening backtick run.
    :returns: the position just after the closing run, or 0 when no
        closing run exists.
    """
    max_ticks: int = md_parser['cmark']['generic']['MAXBACKTICKS']

    if openticklength > max_ticks:
        # we limit backtick string length because of the array subj->backticks:
        return 0

    if (subj.scanned_for_backticks
            and subj.backticks[openticklength] <= subj.pos):
        # return if we already know there's no closer
        return 0

    while True:
        # Read non backticks. _cmark_peek_char returns 0 at the end of the
        # input: stop there, otherwise this loop would keep advancing past
        # the end without ever meeting a backtick.
        c: int = _cmark_peek_char(subj)
        while c and not _cmark_isbacktick(c):
            _cmark_advance(subj)
            c = _cmark_peek_char(subj)

        if _cmark_is_eof(subj):
            break

        numticks: int = 0
        while _cmark_isbacktick(_cmark_peek_char(subj)):
            _cmark_advance(subj)
            numticks += 1

        # Store the starting point of this run so that later searches can
        # be answered without scanning again.
        if numticks <= max_ticks:
            subj.backticks[numticks] = subj.pos - numticks

        if numticks == openticklength:
            return subj.pos

    # got through whole input without finding closer
    subj.scanned_for_backticks = True
    return 0
# Destructively modify string, converting newlines to
# spaces, then removing a single leading + trailing space,
# unless the code span consists entirely of space characters.
# 0.29, 0.30
def _cmark_S_normalize_code(s: _cmarkCmarkStrbuf):
    r"""Normalize in place the code span text stored in ``s``.

    Line endings are converted to single spaces. When the text contains at
    least one non-space character and both starts and ends with a space,
    one space is removed from each side.

    :parameter s: the buffer to normalize. ``s.ptr`` and ``s.size`` are
        updated.
    """
    # r is the read index, w the write index. w never gets ahead of r.
    r: int = 0
    w: int = 0
    contains_nonspace: bool = False

    while r < s.size:
        if s.ptr[r] == '\r':
            # A CR followed by LF emits nothing: the LF emits the space.
            # C relies on the terminator when CR is the last character;
            # here the bound is checked explicitly to avoid an IndexError.
            if r + 1 >= s.size or s.ptr[r + 1] != '\n':
                s.ptr = _replace_substring(s.ptr, ' ', w, w)
                w += 1
        elif s.ptr[r] == '\n':
            s.ptr = _replace_substring(s.ptr, ' ', w, w)
            w += 1
        else:
            s.ptr = _replace_substring(s.ptr, s.ptr[r], w, w)
            w += 1

        if s.ptr[r] != ' ':
            contains_nonspace = True

        r += 1

    # begins and ends with space?
    if (contains_nonspace
            and s.ptr[0] == ' '
            and s.ptr[w - 1] == ' '):
        _cmark_cmark_strbuf_drop(s, 1)
        _cmark_cmark_strbuf_truncate(s, w - 2)
    else:
        _cmark_cmark_strbuf_truncate(s, w)
# Parse backtick code section or raw backticks, return an inline.
# Assumes that the subject has a backtick at the current position.
# 0.29, 0.30
def _cmark_handle_backticks(subj: _cmarkSubject, options: int) -> _cmarkCmarkNode:
    r"""Parse a code span, or plain backticks, starting at the current position.

    :parameter subj: the subject, positioned on a backtick.
    :parameter options: forwarded to ``_cmark_adjust_subj_node_newlines``.
    :returns: a code node when a matching closing run exists, otherwise a
        text node holding the opening backticks.
    """
    openticks: _cmarkCmarkChunk = _cmark_take_while(subj)
    startpos: int = subj.pos
    endpos: int = _cmark_scan_to_closing_backticks(subj, openticks.length)

    # not found
    if endpos == 0:
        # rewind
        subj.pos = startpos
        return _cmark_make_str(subj, subj.pos, subj.pos, openticks)

    buf = _cmark_CMARK_BUF_INIT(subj.mem)
    _cmark_cmark_strbuf_set(buf, subj.input.data[startpos:], endpos - startpos - openticks.length)
    _cmark_S_normalize_code(buf)

    node: _cmarkCmarkNode = _cmark_make_literal(
        subj, md_parser['cmark']['cmark_node_type']['CMARK_NODE_CODE'],
        startpos, endpos - openticks.length - 1)
    # The node attribute is `length` (C: node->len). The size must be read
    # before detaching because detaching resets the buffer.
    node.length = buf.size
    # Keep only the logical content: the buffer string may be longer than
    # the size recorded for it.
    node.data = _cmark_cmark_strbuf_detach(buf)[:node.length]
    _cmark_adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.length, options)

    return node
# 0.29, 0.30
@ -509,6 +569,19 @@ def _cmark_scan_delims(subj: _cmarkSubject, c: str) -> tuple:
return numdelims, can_open, can_close
# 0.30
def _cmark_pop_bracket(subj: _cmarkSubject):
    # license C applies here. See docs/copyright_license.rst
    r"""Remove the most recent bracket of ``subj``, if there is one."""
    top: _cmarkBracket = subj.last_bracket
    if top is None:
        return

    subj.last_bracket = top.previous

    # No need to free.
    # subj->mem->free(b);
    _noop(top)
# 0.29, 0.30
def _cmark_push_delimiter(subj: _cmarkSubject, c: str, can_open: bool,
can_close: bool, inl_text: _cmarkCmarkNode):
@ -522,27 +595,22 @@ def _cmark_push_delimiter(subj: _cmarkSubject, c: str, can_open: bool,
# 0.29, 0.30
def _cmark_cmark_chunk_dup(ch: _cmarkCmarkChunk, pos: int, length: int) -> _cmarkCmarkChunk:
    # license E applies here. See docs/copyright_license.rst
    r"""Return a new chunk built from ``ch.data[pos:pos + length]``.

    :parameter ch: the source chunk. It is not modified.
    :parameter pos: the index of the first character to take.
    :parameter length: the number of characters requested. It is stored
        as-is in the new chunk.
    :returns: a new ``_cmarkCmarkChunk``.
    """
    # Slicing a str already yields an independent immutable value, so no
    # extra copy is required.
    return _cmarkCmarkChunk(ch.data[pos: pos + length], length)
# 0.30
def _cmark_cmark_chunk_literal(data: str) -> _cmarkCmarkChunk:
    # license E applies here. See docs/copyright_license.rst
    r"""Wrap ``data`` in a chunk whose length is ``len(data)``, or 0 for ``None``."""
    size: int = 0 if data is None else len(data)

    return _cmarkCmarkChunk(data, size)
def _cmark_push_bracket(subj: _cmarkSubject, image: bool, inl_text: _cmarkCmarkNode):
    # license C applies here. See docs/copyright_license.rst
    r"""Record a new active bracket as the most recent bracket of ``subj``.

    :parameter subj: the subject whose ``last_bracket`` is replaced.
    :parameter image: stored in the ``image`` attribute of the new bracket.
    :parameter inl_text: the node associated with the bracket.
    """
    b = _cmarkBracket()

    enclosing = subj.last_bracket
    if enclosing is not None:
        # The previous bracket now has a bracket after it.
        enclosing.bracket_after = True

    b.previous = enclosing
    b.previous_delimiter = subj.last_delim
    b.inl_text = inl_text
    b.position = subj.pos
    b.image = image
    b.active = True
    b.bracket_after = False

    subj.last_bracket = b
# Assumes the subject has a c at the current position.
# 0.29, 0.30
def _cmark_handle_delim(subj: _cmarkSubject, c: str, smart: bool = False) -> _cmarkCmarkNode:
# license C applies here. See docs/copyright_license.rst
@ -573,69 +641,79 @@ def _cmark_handle_delim(subj: _cmarkSubject, c: str, smart: bool = False) -> _cm
# 0.29, 0.30
def _cmark_peek_char(subj: _cmarkSubject) -> int:
def _cmark_process_emphasis(subj: _cmarkSubject, stack_bottom: _cmarkDelimiter, ignore: list) -> list:
# license C applies here. See docs/copyright_license.rst
# Instead of using assert just raise a ValueError
if subj.pos < subj.input.length and ord(subj.input.data[subj.pos]) == 0:
raise ValueError
closer: _cmarkDelimiter = subj.last_delim
opener: _cmarkDelimiter
openers_bottom_index: int = 0
opener_found: bool
openers_bottom_index: int = 0
openers_bottom: list = [stack_bottom, stack_bottom, stack_bottom, stack_bottom, stack_bottom, stack_bottom]
if subj.pos < subj.input.length:
return ord(subj.input.data[subj.pos])
else:
return 0
# move back to first relevant delim.
while closer is not None and closer.previous is not stack_bottom:
closer = closer.previous
# now move forward, looking for closers, and handling each
while closer is not None:
if closer.can_close:
if closer.delim_char == '"':
openers_bottom_index = 0
elif closer.delim_char == '\'':
openers_bottom_index = 1
elif closer.delim_char == '_':
openers_bottom_index = 2
elif closer.delim_char == '*':
openers_bottom_index = 3 + (closer.length % 3)
else:
raise ValueError
# 0.30
# Unlink a node without adjusting its next, prev, and parent pointers.
def _cmark_S_node_unlink(node: _cmarkCmarkNode):
# license C applies here. See docs/copyright_license.rst
if node is None:
return
# Now look backwards for first matching opener:
opener = closer.previous
opener_found = False
while opener is not None and opener != openers_bottom[openers_bottom_index]:
if opener.can_open and opener.delim_char == closer.delim_char:
# interior closer of size 2 can't match opener of size 1
# or of size 1 can't match 2
if (not (closer.can_open or opener.can_close)
or closer.length % 3 == 0
or (opener.length + closer.length) % 3 != 0):
opener_found = True
break
opener = opener.previous
if node.prev:
node.prev.next = node.next
if node.next:
node.next.prev = node.prev
old_closer = closer
if closer.delim_char == '*' or closer.delim_char == '_':
if opener_found:
closer = _cmark_remove_emph(subj, opener, closer, ignore)
else:
closer = closer.next
# Adjust first_child and last_child of parent.
# Start and end pointers.
parent: _cmarkCmarkNode = node.parent
if parent:
if parent.first_child == node:
parent.first_child = node.next
if parent.last_child == node:
parent.last_child = node.prev
elif closer.delim_char == '\'':
_cmark_cmark_node_set_literal(closer.inl_text, md_parser['cmark']['generic']['RIGHTSINGLEQUOTE'])
if opener_found:
_cmark_cmark_node_set_literal(opener.inl_text, md_parser['cmark']['generic']['LEFTSINGLEQUOTE'])
closer = closer.next
elif closer.delim_char == '"':
_cmark_cmark_node_set_literal(closer.inl_text, md_parser['cmark']['generic']['RIGHTDOUBLEQUOTE'])
if opener_found:
_cmark_cmark_node_set_literal(opener.inl_text, md_parser['cmark']['generic']['LEFTDOUBLEQUOTE'])
closer = closer.next
if not opener_found:
# set lower bound for future searches for openers
openers_bottom[openers_bottom_index] = old_closer.previous
if not old_closer.can_open:
# we can remove a closer that can't be an
# opener, once we've seen there's no
# matching opener:
subj.extract(old_closer)
else:
closer = closer.next
# 0.30
def _cmark_S_free_nodes(e: _cmarkCmarkNode):
    # license C applies here. See docs/copyright_license.rst
    r"""Walk ``e`` and its descendants as a single flattened ``next`` chain.

    Whenever the visited node has children, they are spliced in right
    after it: the last child inherits the node's ``next`` and the node's
    ``next`` becomes its first child.  Nothing is released explicitly.

    :parameter e: the node the walk starts from; its ``mem`` attribute is
        read before the loop.
    """
    allocator = e.mem
    _noop(allocator)

    current = e
    while current is not None:
        # No need to run free operations.
        if current.last_child:
            # Splice children into list
            current.last_child.next = current.next
            current.next = current.first_child
        # mem->free(e); has no Python counterpart, so just step forward.
        current = current.next
# 0.30
def _cmark_cmark_node_free(node: _cmarkCmarkNode):
    # license C applies here. See docs/copyright_license.rst
    r"""Detach ``node`` from its tree and walk its subtree for disposal.

    The node is first unlinked from its siblings and parent, then its
    ``next`` reference is cleared so that the following walk in
    ``_cmark_S_free_nodes`` starts from ``node`` with no following sibling.

    :parameter node: the node to dispose of.
    """
    _cmark_S_node_unlink(node)
    # Cut the forward link: only node and what hangs below it is walked.
    node.next = None
    _cmark_S_free_nodes(node)
# 0.29, 0.30
@ -719,150 +797,6 @@ def _cmark_remove_emph(subj: _cmarkSubject, opener: _cmarkDelimiter, closer: _cm
return closer
# 0.30
def _cmark_cmark_set_cstr(mem, dst: str, src: str) -> int:
    # license C applies here. See docs/copyright_license.rst
    r"""Return the length that ``src`` would occupy once stored.

    ``dst`` is only rebound locally: the caller's object is never changed
    by this function, so callers have to store the new content themselves.

    :parameter mem: unused here.
    :parameter dst: the previous content.
    :parameter src: the new content.
    :returns: ``len(src)`` when ``src`` and its first element are truthy,
        ``0`` otherwise.
    """
    previous: str = dst
    _noop(previous)

    if not (src and src[0]):
        dst = None
        # No need to free in Python.
        return 0

    size: int = len(src)
    dst = copy.deepcopy(src)
    # No need to free in Python.
    return size
# 0.30
def _cmark_cmark_node_set_literal(node: _cmarkCmarkNode, content: str) -> int:
    # license C applies here. See docs/copyright_license.rst
    r"""Replace the literal content of a node that carries text.

    Only nodes of type HTML block, text, HTML inline, code and code block
    are modified; any other type is left untouched.

    :parameter node: the node to update, or ``None``.
    :parameter content: the new literal content.
    :returns: ``1`` if the node was updated, ``0`` if ``node`` is ``None``
        or its type does not carry a literal.
    """
    if node is None:
        return 0

    node_types = md_parser['cmark']['cmark_node_type']
    literal_types = (
        node_types['CMARK_NODE_HTML_BLOCK'],
        node_types['CMARK_NODE_TEXT'],
        node_types['CMARK_NODE_HTML_INLINE'],
        node_types['CMARK_NODE_CODE'],
        node_types['CMARK_NODE_CODE_BLOCK'],
    )
    if node.type in literal_types:
        # _cmark_cmark_set_cstr takes (mem, dst, src) and returns the
        # length only.
        node.length = _cmark_cmark_set_cstr(node.mem, node.data, content)
        # It cannot write through dst, so store the data here: empty
        # content becomes None, matching the length of 0 reported above.
        node.data = content if content else None
        return 1

    return 0
# 0.29, 0.30
def _cmark_process_emphasis(subj: _cmarkSubject, stack_bottom: _cmarkDelimiter, ignore: list) -> list:
    # license C applies here. See docs/copyright_license.rst
    r"""Pair closing delimiters with openers above ``stack_bottom``.

    The delimiter list is rewound to the first entry above
    ``stack_bottom`` and then scanned forward.  For every delimiter that
    can close, the nearest compatible opener is searched backwards:
    ``*`` and ``_`` pairs are handed to ``_cmark_remove_emph`` while quote
    delimiters get their text node literal replaced.  Finally every
    delimiter above ``stack_bottom`` is extracted from the subject.

    :parameter subj: the subject owning the delimiter list.
    :parameter stack_bottom: the delimiter below which nothing is touched.
    :parameter ignore: passed through to ``_cmark_remove_emph``.
    :raises ValueError: if a closing delimiter is none of ``"``, ``'``,
        ``_`` or ``*``.

    NOTE(review): the signature is annotated ``-> list`` but no value is
    returned.
    """
    closer: _cmarkDelimiter = subj.last_delim
    # One lower bound per closer category: '"', '\'', '_' and three for
    # '*' (selected by length modulo 3).
    openers_bottom: list = [stack_bottom] * 6
    fixed_slots = {'"': 0, '\'': 1, '_': 2}

    # move back to first relevant delim.
    while closer is not None and closer.previous is not stack_bottom:
        closer = closer.previous

    # now move forward, looking for closers, and handling each
    while closer is not None:
        if not closer.can_close:
            closer = closer.next
            continue

        delim = closer.delim_char
        if delim == '*':
            openers_bottom_index = 3 + (closer.length % 3)
        elif delim in fixed_slots:
            openers_bottom_index = fixed_slots[delim]
        else:
            raise ValueError

        # Now look backwards for first matching opener:
        opener_found = False
        opener = closer.previous
        while opener is not None and opener != openers_bottom[openers_bottom_index]:
            if opener.can_open and opener.delim_char == closer.delim_char:
                # interior closer of size 2 can't match opener of size 1
                # or of size 1 can't match 2
                if (not (closer.can_open or opener.can_close)
                        or closer.length % 3 == 0
                        or (opener.length + closer.length) % 3 != 0):
                    opener_found = True
                    break
            opener = opener.previous

        old_closer = closer
        if delim in ('*', '_'):
            if opener_found:
                closer = _cmark_remove_emph(subj, opener, closer, ignore)
            else:
                closer = closer.next
        elif delim == '\'':
            _cmark_cmark_node_set_literal(closer.inl_text, md_parser['cmark']['generic']['RIGHTSINGLEQUOTE'])
            if opener_found:
                _cmark_cmark_node_set_literal(opener.inl_text, md_parser['cmark']['generic']['LEFTSINGLEQUOTE'])
            closer = closer.next
        elif delim == '"':
            _cmark_cmark_node_set_literal(closer.inl_text, md_parser['cmark']['generic']['RIGHTDOUBLEQUOTE'])
            if opener_found:
                _cmark_cmark_node_set_literal(opener.inl_text, md_parser['cmark']['generic']['LEFTDOUBLEQUOTE'])
            closer = closer.next

        if not opener_found:
            # set lower bound for future searches for openers
            openers_bottom[openers_bottom_index] = old_closer.previous
            if not old_closer.can_open:
                # we can remove a closer that can't be an
                # opener, once we've seen there's no
                # matching opener:
                subj.extract(old_closer)

    # free all delimiters in list until stack_bottom:
    while subj.last_delim is not None and subj.last_delim != stack_bottom:
        subj.extract(subj.last_delim)
# 0.29, 0.30
def _cmark_skip_line_end(subj: _cmarkSubject) -> bool:
    # license C applies here. See docs/copyright_license.rst
    r"""Advance over an optional ``\r`` followed by an optional ``\n``.

    :parameter subj: the subject being scanned.
    :returns: ``True`` if at least one of the two characters was skipped,
        otherwise the result of ``_cmark_is_eof``.
    """
    consumed: bool = False
    # Order matters: a carriage return is looked for before a line feed.
    for terminator in ('\r', '\n'):
        if _cmark_peek_char(subj) == terminator:
            _cmark_advance(subj)
            consumed = True
    return consumed or _cmark_is_eof(subj)
# 0.29, 0.30
def _cmark_make_simple(mem, t: int) -> _cmarkCmarkNode:
    # license C applies here. See docs/copyright_license.rst
    r"""Create an inline node that has a type but no content.

    :parameter mem: stored on the node as a deep copy.
    :parameter t: the node type.
    :returns: the new node.
    """
    node = _cmarkCmarkNode()
    node.mem = copy.deepcopy(mem)
    node.type = t
    return node
# 0.29, 0.30
def _cmark_make_linebreak(mem):
    # license C applies here. See docs/copyright_license.rst
    r"""Create a node of type ``CMARK_NODE_LINEBREAK``.

    :parameter mem: forwarded to ``_cmark_make_simple``.
    :returns: the node built by ``_cmark_make_simple``.
    """
    # The node must be handed back to the caller: without the return the
    # freshly built node is dropped and the function always yields None.
    return _cmark_make_simple(mem, md_parser['cmark']['cmark_node_type']['CMARK_NODE_LINEBREAK'])
# 0.29, 0.30
def _cmark_handle_backslash(subj: _cmarkSubject):
r"""Parse backslash-escape or just a backslash, returning an inline."""
@ -880,59 +814,6 @@ def _cmark_handle_backslash(subj: _cmarkSubject):
return _cmark_make_str(subj, subj.pos - 1, subj.pos - 1, _cmark_cmark_chunk_literal('\\'))
# 0.29, 0.30
def _cmark_make_literal(subj: _cmarkSubject, t: int, start_column: int, end_column: int) -> _cmarkCmarkNode:
    # license C applies here. See docs/copyright_license.rst
    r"""Create an inline with a literal string value.

    :parameter subj: supplies ``mem`` (deep copied), the line number and
        the column and block offsets.
    :parameter t: the node type.
    :parameter start_column: start column relative to the subject.
    :parameter end_column: end column relative to the subject.
    :returns: the new node, positioned but without content.
    """
    node = _cmarkCmarkNode()
    # cmark_strbuf_init(subj->mem, &e->content, 0)
    node.mem = copy.deepcopy(subj.mem)
    node.type = t

    # The literal sits on a single line.
    node.start_line = subj.line
    node.end_line = subj.line

    # columns are NOT 1 based.
    shift = subj.column_offset + subj.block_offset
    node.start_column = start_column + shift
    node.end_column = end_column + shift
    return node
# 0.29, 0.30
def _cmark_make_str(subj: _cmarkSubject, sc: int, ec: int, s: _cmarkCmarkChunk) -> _cmarkCmarkNode:
    # license C applies here. See docs/copyright_license.rst
    r"""Create a text node holding a copy of the content of a chunk.

    :parameter subj: the subject, forwarded to ``_cmark_make_literal``.
    :parameter sc: start column.
    :parameter ec: end column.
    :parameter s: the chunk providing ``data`` and ``length``.
    :returns: the new ``CMARK_NODE_TEXT`` node.
    """
    text_node = _cmark_make_literal(subj, md_parser['cmark']['cmark_node_type']['CMARK_NODE_TEXT'], sc, ec)

    # Realloc with NULL ptr is equal to malloc, so no need to translate
    # this operation:
    # e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);
    source = s.data
    if source is not None:
        text_node.data = copy.deepcopy(source)

    # No need to add line terminator (\0).
    text_node.length = s.length
    return text_node
# 0.29, 0.30
def _cmark_push_bracket(subj: _cmarkSubject, image: bool, inl_text: _cmarkCmarkNode):
    # license C applies here. See docs/copyright_license.rst
    r"""Push a new active bracket on top of the bracket stack of ``subj``.

    The bracket that was on top, if any, is flagged as being followed by
    another bracket.

    :parameter subj: the subject owning the bracket stack.
    :parameter image: stored as the ``image`` attribute of the bracket.
    :parameter inl_text: the text node associated with the bracket.
    """
    top = subj.last_bracket
    if top is not None:
        top.bracket_after = True

    bracket = _cmarkBracket()
    bracket.image = image
    bracket.active = True
    bracket.inl_text = inl_text
    bracket.previous = top
    bracket.previous_delimiter = subj.last_delim
    bracket.position = subj.pos
    bracket.bracket_after = False

    subj.last_bracket = bracket
# 0.29, 0.30
def _cmark_subject_find_special_char(subj: _cmarkSubject, options: int) -> int:
# license C applies here. See docs/copyright_license.rst
@ -985,340 +866,8 @@ def _cmark_subject_find_special_char(subj: _cmarkSubject, options: int) -> int:
return subj.input.length
# 0.29, 0.30
def _cmark_subject_from_buf(mem, line_number: int,
                            block_offset: int, e: _cmarkSubject, chunk: _cmarkCmarkChunk,
                            refmap: _cmarkCmarkReferenceMap):
    # license C applies here. See docs/copyright_license.rst
    r"""Reset the subject ``e`` so that it scans ``chunk`` from the start.

    :parameter mem: stored as is on the subject.
    :parameter line_number: the line the chunk belongs to.
    :parameter block_offset: offset of the enclosing block.
    :parameter e: the subject to (re)initialize in place.
    :parameter chunk: the input to be scanned.
    :parameter refmap: the reference map to attach to the subject.
    """
    # Input and its position in the document.
    e.mem = mem
    e.input = chunk
    e.line = line_number
    e.pos = 0
    e.block_offset = block_offset
    e.column_offset = 0
    e.refmap = refmap

    # Empty delimiter and bracket stacks.
    e.last_delim = None
    e.last_bracket = None

    # NOTE(review): the C reference loops with ``i <= MAXBACKTICKS``;
    # confirm the size of e.backticks before widening this range.
    for slot in range(0, md_parser['cmark']['generic']['MAXBACKTICKS']):
        e.backticks[slot] = 0
    e.scanned_for_backticks = False
# 0.30
def _cmark_S_is_line_end_char(c: str) -> bool:
# license C applies here. See docs/copyright_license.rst
return c == '\n' or c == '\r'
# 0.30
# /**
# * Returns 1 if c is a "whitespace" character as defined by the spec.
# */
# int cmark_isspace(char c) { return cmark_ctype_class[(uint8_t)c] == 1; }
# The only defined whitespaces in the spec are Unicode whitespaces.
def _cmark_cmark_chunk_rtrim(c: _cmarkCmarkChunk):
    # license E applies here. See docs/copyright_license.rst
    r"""Drop trailing whitespace from a chunk by shrinking its ``length``.

    Only ``c.length`` is decreased: ``c.data`` is not modified.  A
    character counts as whitespace when it is found in the ``UWC``
    collection of the cmark parser constants.

    :parameter c: the chunk to trim in place.
    """
    # if (!cmark_isspace(c->data[c->len - 1])) break;
    while c.length > 0 and c.data[c.length - 1] in md_parser['cmark']['pseudo-re']['UWC']:
        c.length -= 1
# 0.30
def _cmark_isbacktick(c: int) -> int:
backtick: int = 0
if chr(c) == '`':
backtick = 1
return backtick
# Try to process a backtick code span that began with a
# span of ticks of length openticklength length (already
# parsed). Return 0 if you don't find matching closing
# backticks, otherwise return the position in the subject
# after the closing backticks.
# 0.29, 0.30
def _cmark_scan_to_closing_backticks(subj: _cmarkSubject, openticklength: int) -> int: