Much more adherence to the original cmark code.

This commit is contained in:
Franco Masotti 2022-01-23 19:11:14 +01:00
parent 8a4295a7f7
commit 3c819ea781
Signed by: frnmst
GPG Key ID: 24116ED85666780A
3 changed files with 1278 additions and 145 deletions

View File

@ -694,7 +694,7 @@ def remove_html_tags(line: str, parser: str = 'github') -> str:
def remove_emphasis(line: str, parser: str = 'github') -> list:
r"""Remove emphasis.
r"""Remove markdown emphasis.
:parameter line: a string.
:parameter parser: decides rules on how to find delimiters.
@ -704,22 +704,21 @@ def remove_emphasis(line: str, parser: str = 'github') -> list:
:returns: the input line without emphasis.
:rtype: str
:raises: a built-in exception.
.. note:: Backslashes are preserved.
"""
if parser in ['github', 'cmark', 'gitlab', 'commonmarker', 'goldmark', 'redcarpet']:
mem = None
refmap = None
content = cmark._cmarkCmarkChunk(line, len(line), 0)
subj = cmark._cmark_Subject(input=line)
cmark._cmark_subject_from_buf(mem, 0, 0, subj, content, refmap)
parent = cmark._cmarkCmarkNode()
parent.data = line
parent.length = len(line)
parent.start_line = 0
parent.start_column = 0
parent.internal_offset = 1
while not cmark._cmark_is_eof(subj):
cmark._cmark_parse_inline(subj)
ignore = list()
# When we hit the end of the input, we call the process emphasis procedure (see below), with stack_bottom = NULL.
cmark._cmark_process_emphasis(subj, None, ignore)
ignore = cmark._cmark_cmark_parse_inlines(mem, parent, refmap, 0)
line = filter_indices_from_line(line, ignore)
elif parser in ['redcarpet']:

View File

@ -26,12 +26,65 @@ from .constants import parser as md_parser
from .exceptions import CannotTreatUnicodeString
def _noop(var):
# Black hole for unused variables
# to avoid triggering flake8.
pass
class _cmarkCmarkReference:
def __init__(self):
next = None
label: str = None
url: str = None
title: str = None
age: int = 0
size: int = 0
_noop(next)
_noop(label)
_noop(url)
_noop(title)
_noop(age)
_noop(size)
class _cmarkCmarkReferenceMap:
def __init__(self):
mem = None
refs: _cmarkCmarkReference
sorted: _cmarkCmarkReference
size: int = 0
ref_size: int = 0
max_ref_size: int = 0
_noop(mem)
_noop(size)
_noop(ref_size)
_noop(max_ref_size)
class _cmarkCmarkNode:
def __init__(self):
# cmark_strbuf
self.content = None
# /** Defines the memory allocation functions to be used by CMark
# * when parsing and allocating a document tree
# */
# typedef struct cmark_mem {
# void *(*calloc)(size_t, size_t);
# void *(*realloc)(void *, size_t);
# void (*free)(void *);
# } cmark_mem;
self.mem = None
self.type = None
# Main.
self.data = None
self.length = 0
self.prev = None
self.next = None
self.parent = None
self.first_child = None
self.last_child = None
@ -44,15 +97,39 @@ class _cmarkCmarkNode:
self.end_column = 0
self.internal_offset = 0
# union (cmark_chunk)
self.as_literal = None
self.as_literal_len: int = 0
# Add a new variable.
self.numdelims: int = 0
def append_child(self):
pass
class _cmarkDelimiterDLLNode:
def append_child_lite(self, child):
    r"""Append ``child`` at the end of this node's children.

    :parameter child: the node to append. ``None`` is accepted and
         ignored because some inline branches do not build any node.
    :returns: None

    The ``next``, ``prev`` and ``parent`` attributes of ``child`` are
    overwritten, as are ``last_child`` (always) and ``first_child``
    (only when there were no children) of this node.
    """
    if child is None:
        # Nothing to link: dereferencing None below would raise
        # an AttributeError.
        return

    old_last_child = self.last_child

    child.next = None
    child.prev = old_last_child
    child.parent = self
    self.last_child = child

    if old_last_child is not None:
        old_last_child.next = child
    else:
        # Also set first_child if node previously had no children.
        self.first_child = child
class _cmarkCmarkChunk:
r"""See chunk.h file."""
def __init__(self, data: str = None, length: int = 0, alloc: int = 0):
self.data: str = data
self.length: int = length
# also implies a NULL-terminated string
self.alloc: int = alloc
class _cmarkDelimiter:
r"""A list node with attributes useful for processing emphasis."""
def __init__(self, delim_char: str, length: int):
@ -101,21 +178,10 @@ class _cmarkDelimiterDLLNode:
+ cc + '\n')
class _cmarkCmarkChunk:
r"""See chunk.h file."""
def __init__(self, data, length, alloc):
self.data: str = data
self.length: int = length
# also implies a NULL-terminated string
self.alloc: int = alloc
class _cmark_Subject:
class _cmarkSubject:
r"""A double linked list useful for processing emphasis."""
def __init__(self, input: str):
def __init__(self):
r"""Define the memory allocation functions to be used by CMark when parsing and allocating a document tree.
typedef struct cmark_mem {
@ -136,24 +202,29 @@ class _cmark_Subject:
# This corresponds to the line.
# cmark_chunk input
self.input: str = input
self.input: _cmarkCmarkChunk = None
# cmark_reference_map *refmap
self.refmap = None
self.refmap: _cmarkCmarkReferenceMap = None
self.backticks: list = list(range(0, md_parser['cmark']['generic']['MAXBACKTICKS']))
self.scanned_for_backticks: bool = False
def push(self, node: _cmarkDelimiterDLLNode):
def push(self, node: _cmarkDelimiter):
r"""Add a new node."""
if self.start is None and self.last_delim is None:
# Empty list.
self.start = self.last_delim = node
else:
self.last_delim.next = node
node.previous = self.last_delim
if node.previous is not None:
# Connect the last existing node to the new node.
node.previous.next = node
self.last_delim = self.last_delim.next
node.next = None
def pop(self) -> _cmarkDelimiterDLLNode:
def pop(self) -> _cmarkDelimiter:
r"""Remove the last node."""
node = None
if self.start is None and self.last_delim is None:
@ -167,22 +238,27 @@ class _cmark_Subject:
return node
def extract(self, node: _cmarkDelimiterDLLNode):
r"""Remove a specific node."""
if node is None:
pass
elif self.start is not None and self.last_delim is not None:
if node == self.last_delim:
self.last_delim = node.previous
def extract(self, delim: _cmarkDelimiter):
r"""Remove a specific node.
if self.last_delim is None:
self.start = None
else:
node.next.previous = node.previous
if node.previous is not None:
node.previous.next = node.next
else:
self.start = node.next
This method is equivalent to the remove_delimiter
function in inlines.c
"""
if delim is None:
return
if delim.next is None:
# end of list:
if delim != self.last_delim:
raise ValueError
self.last_delim = delim.previous
else:
delim.next.previous = delim.previous
if delim.previous is not None:
delim.previous.next = delim.next
# subj.mem.free(delim)
def scroll(self):
r"""Print the list."""
@ -194,7 +270,7 @@ class _cmark_Subject:
x = x.next
class _cmark_Bracket:
class _cmarkBracket:
def __init__(self):
self.previous = None
self.previous_delimiter = None
@ -208,11 +284,13 @@ class _cmark_Bracket:
self.bracket_after = False
def _cmark_advance(subj: _cmark_Subject):
# 0.29, 0.30
def _cmark_advance(subj: _cmarkSubject):
# Advance the subject. Doesn't check for eof.
subj.pos += 1
# 0.29, 0.30
def _cmark_cmark_utf8proc_is_space(char: int, parser: str = 'github') -> bool:
r"""Match anything in the Zs class, plus LF, CR, TAB, FF."""
# license C applies here. See docs/copyright_license.rst.
@ -223,16 +301,7 @@ def _cmark_cmark_utf8proc_is_space(char: int, parser: str = 'github') -> bool:
return value
def _cmark_cmark_utf8proc_is_punctuation(char: int, parser: str = 'github') -> bool:
r"""Match anything in the P[cdefios] classes."""
# license C applies here. See docs/copyright_license.rst.
value = False
if chr(char) in md_parser[parser]['pseudo-re']['PC']:
value = True
return value
# 0.29, 0.30
def _cmark_cmark_ispunct(char: int, parser: str = 'github') -> bool:
r"""Return True if c is an ascii punctuation character."""
# license C applies here. See docs/copyright_license.rst.
@ -243,6 +312,18 @@ def _cmark_cmark_ispunct(char: int, parser: str = 'github') -> bool:
return value
# 0.29, 0.30
def _cmark_cmark_utf8proc_is_punctuation(char: int, parser: str = 'github') -> bool:
    r"""Match anything in the P[cdefios] classes.

    :parameter char: the code point to test.
    :parameter parser: selects which parser tables are used.
    :type char: int
    :type parser: str
    :returns: True if the code point is punctuation, False otherwise.
    :rtype: bool
    """
    # license C applies here. See docs/copyright_license.rst.
    # The ASCII test must use the same parser tables as the lookup below,
    # so the parser argument is forwarded instead of relying on the default.
    if char < 128 and _cmark_cmark_ispunct(char, parser):
        return True

    return chr(char) in md_parser[parser]['pseudo-re']['PC']
# 0.29, 0.30
def _cmark_cmark_utf8proc_charlen(line: str, line_length: int) -> int:
# license D applies here. See docs/copyright_license.rst
length: int
@ -278,6 +359,7 @@ def _cmark_cmark_utf8proc_charlen(line: str, line_length: int) -> int:
return length
# 0.29, 0.30
def _cmark_cmark_utf8proc_iterate(line: str, line_len: int) -> tuple:
# license D applies here. See docs/copyright_license.rst
@ -313,23 +395,24 @@ def _cmark_cmark_utf8proc_iterate(line: str, line_len: int) -> tuple:
return length, dst
def _cmark_peek_at(subj: _cmark_Subject, pos: int) -> int:
# 0.29, 0.30
def _cmark_peek_at(subj: _cmarkSubject, pos: int) -> int:
# license C applies here. See docs/copyright_license.rst
return ord(subj.input.data[pos])
def _cmark_scan_delims(subj: _cmark_Subject, c: str) -> tuple:
# 0.29, 0.30
def _cmark_scan_delims(subj: _cmarkSubject, c: str) -> tuple:
# license C applies here. See docs/copyright_license.rst
numdelims: int = 0
before_char_pos: int = 0
after_char: int = 0
before_char: int = 0
left_flanking: bool = False
length: int = 0
left_flanking: bool = False
right_flanking: bool = False
can_open = False
can_close = False
can_open: bool = False
can_close: bool = False
if subj.pos == 0:
before_char = 10
@ -391,10 +474,11 @@ def _cmark_scan_delims(subj: _cmark_Subject, c: str) -> tuple:
return numdelims, can_open, can_close
def _cmark_push_delimiter(subj: _cmark_Subject, c: str, can_open: bool,
# 0.29, 0.30
def _cmark_push_delimiter(subj: _cmarkSubject, c: str, can_open: bool,
can_close: bool, inl_text: _cmarkCmarkNode):
# license C applies here. See docs/copyright_license.rst
delim = _cmarkDelimiterDLLNode(c, inl_text.as_literal_len)
delim = _cmarkDelimiter(c, inl_text.length)
delim.can_open = can_open
delim.can_close = can_close
delim.inl_text = inl_text
@ -402,30 +486,61 @@ def _cmark_push_delimiter(subj: _cmark_Subject, c: str, can_open: bool,
# List operations are handled in the class definition.
# 0.29, 0.30
def _cmark_cmark_chunk_dup(ch: _cmarkCmarkChunk, pos: int, length: int) -> _cmarkCmarkChunk:
    r"""Build a new chunk from a portion of an existing one.

    :parameter ch: the source chunk.
    :parameter pos: index of the first character to take from ``ch.data``.
    :parameter length: number of characters to take.
    :returns: a chunk whose data is ``ch.data[pos: pos + length]`` and
         whose length attribute is ``length``.
    :rtype: _cmarkCmarkChunk
    """
    # license E applies here. See docs/copyright_license.rst
    # A chunk object must be returned, not the bare string: callers read
    # the data and length attributes of the result. Slicing a string
    # already yields an independent object, so no extra copy is needed.
    return _cmarkCmarkChunk(ch.data[pos: pos + length], length)
def _cmark_handle_delim(subj: _cmark_Subject, c: str) -> int:
# 0.30
def _cmark_cmark_chunk_literal(data: str) -> _cmarkCmarkChunk:
    r"""Wrap a string into a chunk.

    :parameter data: the string, or None.
    :returns: a chunk holding ``data``; its length is the length of
         the string, or 0 when ``data`` is None.
    :rtype: _cmarkCmarkChunk
    """
    size: int = 0 if data is None else len(data)
    return _cmarkCmarkChunk(data, size)
# 0.29, 0.30
def _cmark_handle_delim(subj: _cmarkSubject, c: str, smart: bool = False) -> _cmarkCmarkNode:
    r"""Scan a run of delimiters and turn it into a text node.

    :parameter subj: the subject being parsed.
    :parameter c: the delimiter character.
    :parameter smart: if True, quote characters are replaced with the
         typographic quotes defined in the parser constants.
    :returns: the text node created for the delimiter run.
    :rtype: _cmarkCmarkNode

    The delimiter is also pushed on the delimiter stack when it can open
    or close and it is not a quote character.
    """
    # license C applies here. See docs/copyright_license.rst
    numdelims: int
    can_open: bool
    can_close: bool
    inl_text: _cmarkCmarkNode
    contents: _cmarkCmarkChunk
    generic = md_parser['cmark']['generic']

    numdelims, can_open, can_close = _cmark_scan_delims(subj, c)

    # The chunk is built once, in the branch that applies.
    if c == '\'' and smart:
        contents = _cmark_cmark_chunk_literal(generic['RIGHTSINGLEQUOTE'])
    elif c == '"' and smart:
        if can_close:
            contents = _cmark_cmark_chunk_literal(generic['RIGHTDOUBLEQUOTE'])
        else:
            contents = _cmark_cmark_chunk_literal(generic['LEFTDOUBLEQUOTE'])
    else:
        contents = _cmark_cmark_chunk_dup(subj.input, subj.pos - numdelims, numdelims)

    inl_text = _cmark_make_str(subj, subj.pos - numdelims, subj.pos - 1, contents)

    # NOTE(review): the C code seems to push quote delimiters as well
    # when smart is enabled ("... || smart"); confirm before changing.
    if (can_open or can_close) and (not (c == '\'' or c == '"')):
        _cmark_push_delimiter(subj, c, can_open, can_close, inl_text)

    # The node, not the delimiter count, is what the caller appends
    # to the parent.
    return inl_text
def _cmark_peek_char(subj: _cmark_Subject) -> int:
# 0.29, 0.30
def _cmark_peek_char(subj: _cmarkSubject) -> int:
# license C applies here. See docs/copyright_license.rst
# Instead of using assert just raise a ValueError
if subj.pos < subj.input.length and ord(subj.input.data[subj.pos]) == 0:
raise ValueError
@ -435,13 +550,70 @@ def _cmark_peek_char(subj: _cmark_Subject) -> int:
return 0
def _cmark_remove_emph(delimiter_stack: _cmark_Subject, opener: _cmarkDelimiterDLLNode, closer: _cmarkDelimiterDLLNode, ignore: list):
# 0.30
# Unlink a node without adjusting its next, prev, and parent pointers.
def _cmark_S_node_unlink(node: _cmarkCmarkNode):
if node is None:
return
if node.prev:
node.prev.next = node.next
if node.next:
node.next.prev = node.prev
# Adjust first_child and last_child of parent.
# Start and end pointers.
parent: _cmarkCmarkNode = node.parent
if parent:
if parent.first_child == node:
parent.first_child = node.next
if parent.last_child == node:
parent.last_child = node.prev
# 0.30
def _cmark_S_free_nodes(e: _cmarkCmarkNode):
    r"""Walk a node and its descendants the way the C code frees them.

    :parameter e: the first node of the walk.

    Nothing is released here: only the list splicing done by the C
    function while freeing is reproduced, which means that the ``next``
    attributes of the visited nodes are modified.
    """
    _noop(e.mem)

    current: _cmarkCmarkNode = e
    while current is not None:
        if current.last_child:
            # Splice the children into the list so that they are
            # visited right after the current node.
            current.last_child.next = current.next
            current.next = current.first_child
        # mem->free(e) has no Python equivalent: just step forward.
        current = current.next
# 0.30
def _cmark_cmark_node_free(node: _cmarkCmarkNode):
    r"""Remove a node from the tree it belongs to.

    :parameter node: the node to remove.

    The node is first unlinked from its siblings and parent, then its
    ``next`` attribute is cleared and finally the node and its children
    are handed to the free routine.
    """
    _cmark_S_node_unlink(node)
    # Once the node is isolated its former right sibling must not be
    # reachable from it when the free routine follows the next links.
    node.next = None
    _cmark_S_free_nodes(node)
# 0.29, 0.30
def _cmark_remove_emph(subj: _cmarkSubject, opener: _cmarkDelimiter, closer: _cmarkDelimiter, ignore: list):
# license C applies here. See docs/copyright_license.rst
# This function refers to S_insert_emph()
delim: _cmarkDelimiter
tmp_delim: _cmarkDelimiter
use_delims: int
opener_inl: _cmarkCmarkNode = opener.inl_text
closer_inl: _cmarkCmarkNode = closer.inl_text
opener_num_chars: int = opener_inl.as_literal_len
closer_num_chars: int = closer_inl.as_literal_len
opener_num_chars: int = opener_inl.length
closer_num_chars: int = closer_inl.length
tmp: _cmarkCmarkNode
tmpnext: _cmarkCmarkNode
emph: _cmarkCmarkNode
# calculate the actual number of characters used from this closer
if closer_num_chars >= 2 and opener_num_chars >= 2:
@ -452,22 +624,35 @@ def _cmark_remove_emph(delimiter_stack: _cmark_Subject, opener: _cmarkDelimiterD
# remove used characters from associated inlines.
opener_num_chars -= use_delims
closer_num_chars -= use_delims
opener_inl.as_literal_len = opener_num_chars
closer_inl.as_literal_len = closer_num_chars
opener_inl.length = opener_num_chars
closer_inl.length = closer_num_chars
# No need to add string terminators.
# opener_inl->data[opener_num_chars] = 0;
# closer_inl->data[closer_num_chars] = 0;
# free delimiters between opener and closer
delim = closer.previous
while delim is not None and delim != opener:
tmp_delim = delim.previous
delimiter_stack.extract(delim)
subj.extract(delim)
delim = tmp_delim
# IGNORE
#
# create new emph or strong, and splice it in to our inlines
# between the opener and closer
# emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);
# tmp = opener_inl->next;
# while (tmp && tmp != closer_inl) {
# tmpnext = tmp->next;
# cmark_node_unlink(tmp);
# append_child(emph, tmp);
# tmp = tmpnext;
# }
# cmark_node_insert_after(opener_inl, emph);
#
# Custom variables.
# Custom variables and computations.
opener_relative_start = opener_inl.end_column - use_delims + 1 - opener.offset
opener_relative_end = opener_inl.end_column + 1 - opener.offset
closer_relative_start = closer_inl.start_column + closer.offset
@ -481,22 +666,63 @@ def _cmark_remove_emph(delimiter_stack: _cmark_Subject, opener: _cmarkDelimiterD
# if opener has 0 characters, remove it and its associated inline
if opener_num_chars == 0:
delimiter_stack.extract(opener)
_cmark_cmark_node_free(opener_inl)
subj.extract(opener)
# if closer has 0 characters, remove it and its associated inline
if closer_num_chars == 0:
# remove empty closer inline
_cmark_cmark_node_free(closer_inl)
# remove closer from list
tmp_delim = closer.next
delimiter_stack.extract(closer)
subj.extract(closer)
closer = tmp_delim
return closer
def _cmark_process_emphasis(subj: _cmark_Subject, stack_bottom: _cmarkDelimiterDLLNode, ignore: list) -> list:
# 0.30
def _cmark_cmark_set_cstr(mem, dst: str, src: str) -> int:
old: str = dst
length: int
_noop(old)
if src and src[0]:
length = len(src)
dst = copy.deepcopy(src)
else:
length = 0
dst = None
# No need to free in Python.
return length
# 0.30
def _cmark_cmark_node_set_literal(node: _cmarkCmarkNode, content: str) -> int:
    r"""Replace the literal content of a node.

    :parameter node: the node to change. None is accepted.
    :parameter content: the new content.
    :returns: 1 if the content was set, 0 if the node is None or its
         type does not carry a literal.
    :rtype: int

    Only HTML block, text, HTML inline, code and code block nodes are
    changed: their ``data`` and ``length`` attributes are overwritten.
    """
    if node is None:
        return 0

    node_types = md_parser['cmark']['cmark_node_type']
    literal_types = (
        node_types['CMARK_NODE_HTML_BLOCK'],
        node_types['CMARK_NODE_TEXT'],
        node_types['CMARK_NODE_HTML_INLINE'],
        node_types['CMARK_NODE_CODE'],
        node_types['CMARK_NODE_CODE_BLOCK'],
    )
    if node.type not in literal_types:
        return 0

    # _cmark_cmark_set_cstr takes three arguments and returns the length
    # only: it cannot write through its dst argument, so the data is
    # stored here. A length of 0 means empty or missing content, which
    # is stored as None.
    node.length = _cmark_cmark_set_cstr(node.mem, node.data, content)
    node.data = content if node.length > 0 else None

    return 1
# 0.29, 0.30
def _cmark_process_emphasis(subj: _cmarkSubject, stack_bottom: _cmarkDelimiter, ignore: list) -> list:
# license C applies here. See docs/copyright_license.rst
closer: _cmarkDelimiterDLLNode = subj.last_delim
opener: _cmarkDelimiterDLLNode
closer: _cmarkDelimiter = subj.last_delim
opener: _cmarkDelimiter
openers_bottom_index: int = 0
opener_found: bool
openers_bottom_index: int = 0
@ -542,14 +768,14 @@ def _cmark_process_emphasis(subj: _cmark_Subject, stack_bottom: _cmarkDelimiterD
closer = closer.next
elif closer.delim_char == '\'':
closer.inl_text.as_literal = md_parser['cmark']['generic']['RIGHTSINGLEQUOTE']
_cmark_cmark_node_set_literal(closer.inl_text, md_parser['cmark']['generic']['RIGHTSINGLEQUOTE'])
if opener_found:
opener.inl_text.as_literal = md_parser['cmark']['generic']['LEFTSINGLEQUOTE']
_cmark_cmark_node_set_literal(opener.inl_text, md_parser['cmark']['generic']['LEFTSINGLEQUOTE'])
closer = closer.next
elif closer.delim_char == '"':
closer.inl_text.as_literal = md_parser['cmark']['generic']['RIGHTDOUBLEQUOTE']
_cmark_cmark_node_set_literal(closer.inl_text, md_parser['cmark']['generic']['RIGHTDOUBLEQUOTE'])
if opener_found:
opener.inl_text.as_literal = md_parser['cmark']['generic']['LEFTDOUBLEQUOTE']
_cmark_cmark_node_set_literal(opener.inl_text, md_parser['cmark']['generic']['LEFTDOUBLEQUOTE'])
closer = closer.next
if not opener_found:
@ -568,7 +794,8 @@ def _cmark_process_emphasis(subj: _cmark_Subject, stack_bottom: _cmarkDelimiterD
subj.extract(subj.last_delim)
def _cmark_skip_line_end(subj: _cmark_Subject) -> bool:
# 0.29, 0.30
def _cmark_skip_line_end(subj: _cmarkSubject) -> bool:
# license C applies here. See docs/copyright_license.rst
seen_line_end_char: bool = False
@ -581,19 +808,23 @@ def _cmark_skip_line_end(subj: _cmark_Subject) -> bool:
return seen_line_end_char or _cmark_is_eof(subj)
def _cmark_make_simple(mem) -> _cmarkCmarkNode:
# 0.29, 0.30
def _cmark_make_simple(mem, t: int) -> _cmarkCmarkNode:
    r"""Create a node of the given type without any content.

    :parameter mem: copied into the ``content`` and ``mem`` attributes
         of the new node.
    :parameter t: the node type.
    :returns: the new node.
    :rtype: _cmarkCmarkNode
    """
    # license C applies here. See docs/copyright_license.rst
    node = _cmarkCmarkNode()
    node.type = t
    node.content = copy.deepcopy(mem)
    node.mem = copy.deepcopy(mem)
    return node
# 0.29, 0.30
def _cmark_make_linebreak(mem):
    r"""Create a line break node.

    :parameter mem: passed through to the node constructor.
    :returns: a node of type CMARK_NODE_LINEBREAK.
    :rtype: _cmarkCmarkNode
    """
    # license C applies here. See docs/copyright_license.rst
    # The node must be returned: the backslash handler returns the
    # value of this function as the new inline.
    return _cmark_make_simple(mem, md_parser['cmark']['cmark_node_type']['CMARK_NODE_LINEBREAK'])
def _cmark_handle_backslash(subj: _cmark_Subject):
# 0.29, 0.30
def _cmark_handle_backslash(subj: _cmarkSubject):
r"""Parse backslash-escape or just a backslash, returning an inline."""
# license C applies here. See docs/copyright_license.rst
_cmark_advance(subj)
@ -606,37 +837,50 @@ def _cmark_handle_backslash(subj: _cmark_Subject):
elif (not _cmark_is_eof(subj)) and _cmark_skip_line_end(subj):
return _cmark_make_linebreak(subj.mem)
else:
return _cmark_make_str(subj, subj.pos - 1, subj.pos - 1, '\\')
return _cmark_make_str(subj, subj.pos - 1, subj.pos - 1, _cmark_cmark_chunk_literal('\\'))
def _cmark_make_literal(subj: _cmark_Subject, start_column: int, end_column: int, char: str):
# 0.29, 0.30
def _cmark_make_literal(subj: _cmarkSubject, t: int, start_column: int, end_column: int) -> _cmarkCmarkNode:
    r"""Create an inline node of type ``t`` positioned in the subject.

    :parameter subj: the subject; its ``mem``, ``line``,
         ``column_offset`` and ``block_offset`` attributes are read.
    :parameter t: the node type.
    :parameter start_column: start column, relative to the subject.
    :parameter end_column: end column, relative to the subject.
    :returns: the new node. Its data is not set here.
    :rtype: _cmarkCmarkNode
    """
    # license C applies here. See docs/copyright_license.rst
    e = _cmarkCmarkNode()

    # cmark_strbuf_init(subj->mem, &e->content, 0)
    e.content = copy.deepcopy(subj.mem)
    e.mem = copy.deepcopy(subj.mem)
    e.type = t
    e.start_line = e.end_line = subj.line

    # columns are NOT 1 based.
    offset: int = subj.column_offset + subj.block_offset
    e.start_column = start_column + offset
    e.end_column = end_column + offset

    return e
# 0.29, 0.30
def _cmark_make_str(subj: _cmarkSubject, sc: int, ec: int, s: _cmarkCmarkChunk) -> _cmarkCmarkNode:
    r"""Create a text node holding the content of a chunk.

    :parameter subj: the subject being parsed.
    :parameter sc: start column.
    :parameter ec: end column.
    :parameter s: the chunk with the characters of the node.
    :returns: a node of type CMARK_NODE_TEXT.
    :rtype: _cmarkCmarkNode
    """
    # license C applies here. See docs/copyright_license.rst
    text_type = md_parser['cmark']['cmark_node_type']['CMARK_NODE_TEXT']
    node = _cmark_make_literal(subj, text_type, sc, ec)

    # The C code allocates a new buffer at this point
    # (realloc with a NULL pointer is the same as malloc):
    #     e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);
    # Here a copy of the data is enough and there is no line
    # terminator (\0) to add.
    if s.data is not None:
        node.data = copy.deepcopy(s.data)
    node.length = s.length

    return node
def _cmark_make_str(subj: _cmark_Subject, start_column: int, end_column: int, char: str):
# 0.29, 0.30
def _cmark_push_bracket(subj: _cmarkSubject, image: bool, inl_text: _cmarkCmarkNode):
# license C applies here. See docs/copyright_license.rst
return _cmark_make_literal(subj, start_column, end_column, char)
def _cmark_push_bracket(subj: _cmark_Subject, image: bool, inl_text: str):
# license C applies here. See docs/copyright_license.rst
b = _cmark_Bracket()
b = _cmarkBracket()
if subj.last_bracket is not None:
subj.last_bracket.bracket_after = True
b.image = image
@ -649,7 +893,8 @@ def _cmark_push_bracket(subj: _cmark_Subject, image: bool, inl_text: str):
subj.last_bracket = b
def _cmark_subject_find_special_char(subj: _cmark_Subject, options: int) -> int:
# 0.29, 0.30
def _cmark_subject_find_special_char(subj: _cmarkSubject, options: int) -> int:
# license C applies here. See docs/copyright_license.rst
# "\r\n\\`&_*[]<!"
@ -682,35 +927,38 @@ def _cmark_subject_find_special_char(subj: _cmark_Subject, options: int) -> int:
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
]
CMARK_OPT_SMART = 1 << 10
n: int = subj.pos + 1
# Patch to avoid overflow problems.
if n > subj.input.length - 1:
return subj.input.length
if ord(subj.input.data[n]) > len(SPECIAL_CHARS) - 1 or ord(subj.input.data[n]) > len(SMART_PUNCT_CHARS) - 1:
return n
# End patch.
while n < subj.input.length:
if SPECIAL_CHARS[ord(subj.input.data[n])] == 1:
return n
if options & CMARK_OPT_SMART and SMART_PUNCT_CHARS[ord(subj.input.data[n])]:
if options & md_parser['cmark']['generic']['CMARK_OPT_SMART'] and SMART_PUNCT_CHARS[ord(subj.input.data[n])]:
return n
n += 1
return subj.input.length
# 0.29, 0.30
def _cmark_subject_from_buf(mem, line_number: int,
block_offset: int, e: _cmark_Subject,
chunk: _cmarkCmarkChunk, cmark_reference_map=None):
block_offset: int, e: _cmarkSubject, chunk: _cmarkCmarkChunk,
refmap: _cmarkCmarkReferenceMap):
# license C applies here. See docs/copyright_license.rst
i: int
e.mem = mem
e.input = chunk
e.line = line_number
e.pos = 0
e.block_offset = block_offset
e.column_offset = 0
e.refmap = None
e.refmap = refmap
e.last_delim = None
e.last_bracket = None
@ -720,7 +968,28 @@ def _cmark_subject_from_buf(mem, line_number: int,
e.scanned_for_backticks = False
def _cmark_parse_inline(subj: _cmark_Subject, options: int = 0) -> int:
# 0.30
def _cmark_S_is_line_end_char(c: str) -> bool:
return c == '\n' or c == '\r'
# 0.30
# /**
# * Returns 1 if c is a "whitespace" character as defined by the spec.
# */
# int cmark_isspace(char c) { return cmark_ctype_class[(uint8_t)c] == 1; }
# The only defined whitespaces in the spec are Unicode whitespaces.
def _cmark_cmark_chunk_rtrim(c: _cmarkCmarkChunk):
    r"""Exclude the trailing whitespace of a chunk.

    :parameter c: the chunk. Only its ``length`` attribute is reduced:
         the data is left as it is.

    The characters counted as whitespace are the ones listed in the
    UWC table of the cmark parser constants.
    """
    # C equivalent of the test: cmark_isspace(c->data[c->len - 1])
    while c.length > 0 and c.data[c.length - 1] in md_parser['cmark']['pseudo-re']['UWC']:
        c.length -= 1
# 0.29, 0.30
def _cmark_parse_inline(subj: _cmarkSubject, parent: _cmarkCmarkNode, options: int = 0) -> int:
r"""Handle all the different elements of a string."""
# license C applies here. See docs/copyright_license.rst
new_inl: _cmarkCmarkNode = None
@ -732,20 +1001,20 @@ def _cmark_parse_inline(subj: _cmark_Subject, options: int = 0) -> int:
c = _cmark_peek_char(subj)
if c == 0:
return 0
elif chr(c) == '`':
# TODO
_cmark_advance(subj)
elif chr(c) == '\\':
new_inl = _cmark_handle_backslash(subj)
elif chr(c) == '*' or chr(c) == '_' or chr(c) == '\'' or chr(c) == '"':
new_inl = _cmark_handle_delim(subj, chr(c))
new_inl = _cmark_handle_delim(subj, chr(c), (options & md_parser['cmark']['generic']['CMARK_OPT_SMART']) != 0)
elif chr(c) == '[':
_cmark_advance(subj)
new_inl = _cmark_make_str(subj, subj.pos - 1, subj.pos - 1, '[')
new_inl = _cmark_make_str(subj, subj.pos - 1, subj.pos - 1, _cmark_cmark_chunk_literal('['))
_cmark_push_bracket(subj, False, new_inl)
elif chr(c) == ']':
# TODO
_cmark_advance(subj)
elif chr(c) == '`':
# TODO
_cmark_advance(subj)
# TODO: images, code, HTML tags detection.
@ -755,24 +1024,58 @@ def _cmark_parse_inline(subj: _cmark_Subject, options: int = 0) -> int:
startpos = subj.pos
subj.pos = endpos
'''
// if we're at a newline, strip trailing spaces.
if (S_is_line_end_char(peek_char(subj))) {
cmark_chunk_rtrim(&contents);
}
'''
# if we're at a newline, strip trailing spaces.
if _cmark_S_is_line_end_char(_cmark_peek_char(subj)):
_cmark_cmark_chunk_rtrim(contents)
new_inl = _cmark_make_str(subj, startpos, endpos - 1, contents)
"""
if new_inl is not None:
_cmark_cmark_node_append_child(parent, new_inl)
"""
parent.append_child_lite(new_inl)
return 1
def _cmark_is_eof(subj: _cmark_Subject):
# 0.30
def _cmark_pop_bracket(subj: _cmarkSubject):
    r"""Remove the most recent bracket from the bracket stack.

    :parameter subj: the subject; its ``last_bracket`` attribute is
         replaced with the previous bracket. Nothing happens if the
         stack is empty.
    """
    top: _cmarkBracket = subj.last_bracket
    if top is None:
        return

    subj.last_bracket = top.previous

    # The C code frees the bracket here (subj->mem->free(b)):
    # there is nothing to release in Python.
    _noop(top)
# 0.30
def _cmark_cmark_parse_inlines(mem, parent: _cmarkCmarkNode,
                               refmap: _cmarkCmarkReferenceMap, options: int) -> list:
    r"""Get the ignore list.

    :parameter mem: passed through to the subject.
    :parameter parent: the node whose ``data`` is parsed. The inline
         nodes found are appended to it.
    :parameter refmap: the reference map, passed through to the subject.
    :parameter options: parser options, passed to the inline parser.
    :returns: the list handed to the process emphasis procedure.
    :rtype: list
    """
    content: _cmarkCmarkChunk = _cmarkCmarkChunk(parent.data, parent.length)
    subj: _cmarkSubject = _cmarkSubject()
    block_offset: int = parent.start_column - 1 + parent.internal_offset

    _cmark_subject_from_buf(mem, parent.start_line, block_offset, subj, content, refmap)
    _cmark_cmark_chunk_rtrim(subj.input)

    while not _cmark_is_eof(subj):
        _cmark_parse_inline(subj, parent, options)

    # When we hit the end of the input, we call the process emphasis
    # procedure, with stack_bottom = NULL.
    ignore = []
    _cmark_process_emphasis(subj, None, ignore)

    # Empty the delimiter stack first and the bracket stack after.
    while subj.last_delim:
        subj.extract(subj.last_delim)
    while subj.last_bracket:
        _cmark_pop_bracket(subj)

    return ignore
# 0.30
def _cmark_is_eof(subj: _cmarkSubject):
r"""Return true if there are more characters in the subject."""
# license C applies here. See docs/copyright_license.rst.
return subj.pos >= subj.input.length

View File

@ -48,9 +48,48 @@ parser['cmark']['generic'] = {
'RIGHTDOUBLEQUOTE': '\xE2\x80\x9D',
'LEFTSINGLEQUOTE': '\xE2\x80\x98',
'RIGHTSINGLEQUOTE': '\xE2\x80\x99',
'CMARK_OPT_SMART': 1 << 10,
'MAXBACKTICKS': 1000,
}
parser['cmark']['cmark_node_type'] = {
# typedef enum { ... } cmark_node_type;
# Values left implicit in the C source code get their value
# according to their position in the sequence, like an array index.
# Error status
'CMARK_NODE_NONE': 0,
# Block
'CMARK_NODE_DOCUMENT': 1,
'CMARK_NODE_BLOCK_QUOTE': 2,
'CMARK_NODE_LIST': 3,
'CMARK_NODE_ITEM': 4,
'CMARK_NODE_CODE_BLOCK': 5,
'CMARK_NODE_HTML_BLOCK': 6,
'CMARK_NODE_CUSTOM_BLOCK': 7,
'CMARK_NODE_PARAGRAPH': 8,
'CMARK_NODE_HEADING': 9,
'CMARK_NODE_THEMATIC_BREAK': 10,
# Inline
'CMARK_NODE_TEXT': 11,
'CMARK_NODE_SOFTBREAK': 12,
'CMARK_NODE_LINEBREAK': 13,
'CMARK_NODE_CODE': 14,
'CMARK_NODE_HTML_INLINE': 15,
'CMARK_NODE_CUSTOM_INLINE': 16,
'CMARK_NODE_EMPH': 17,
'CMARK_NODE_STRONG': 18,
'CMARK_NODE_LINK': 19,
'CMARK_NODE_IMAGE': 20,
}
parser['cmark']['cmark_node_type']['CMARK_NODE_FIRST_BLOCK'] = 'CMARK_NODE_DOCUMENT'
parser['cmark']['cmark_node_type']['CMARK_NODE_LAST_BLOCK'] = 'CMARK_NODE_THEMATIC_BREAK'
parser['cmark']['cmark_node_type']['CMARK_NODE_FIRST_INLINE'] = 'CMARK_NODE_TEXT'
parser['cmark']['cmark_node_type']['CMARK_NODE_LAST_INLINE'] = 'CMARK_NODE_IMAGE'
parser['cmark']['link'] = {
'max chars label': 999,
}
@ -86,22 +125,814 @@ parser['cmark']['pseudo-re'] = {
# See https://www.fileformat.info/info/unicode/category/Zs/list.htm
# for the Zs characters.
# Unicode Whitespace Character.
'UWC': ['\u0020', '\u00A0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', '\u202F', '\u205F', '\u3000', '\u0009', '\u000D', '\u000A', '\u000D'],
'UWC': [
'\u0009',
'\u000A',
'\u000C',
'\u000D',
'\u0020',
'\u00A0',
'\u1680',
'\u2000',
'\u2001',
'\u2002',
'\u2003',
'\u2004',
'\u2005',
'\u2006',
'\u2007',
'\u2008',
'\u2009',
'\u200A',
'\u202F',
'\u205F',
'\u3000'
],
# ASCII punctuation characters.
'APC': ['\u0021', '\u0022', '\u0023', '\u0024', '\u0025', '\u0026', '\u0027', '\u0028', '\u0029', '\u002A', '\u002B', '\u002C', '\u002D', '\u002E', '\u002F3', '\u003A', '\u003B', '\u003C', '\u003D', '\u003E', '\u003F', '\u0040', '\u005B', '\u005E', '\u005F', '\u0060', '\u007B', '\u007C', '\u007D', '\u007E'],
'APC': [
'\u0021',
'\u0022',
'\u0023',
'\u0024',
'\u0025',
'\u0026',
'\u0027',
'\u0028',
'\u0029',
'\u002A',
'\u002B',
'\u002C',
'\u002D',
'\u002E',
'\u002F3',
'\u003A',
'\u003B',
'\u003C',
'\u003D',
'\u003E',
'\u003F',
'\u0040',
'\u005B',
'\u005E',
'\u005F',
'\u0060',
'\u007B',
'\u007C',
'\u007D',
'\u007E'
],
# Punctuation General Unicode Categories.
'PGUCPC': ['\u005F', '\u203F', '\u2040', '\u2054', '\uFE33', '\uFE33', '\uFE4D', '\uFE4E', '\uFE4F', '\uFF3F'],
'PGUCPD': ['\u002D', '\u058A', '\u05BE', '\u1400', '\u1806', '\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015', '\u2E17', '\u2E1A', '\u2E3A', '\u2E3B', '\u2E40', '\u301C', '\u3030', '\u30A0', '\uFE31', '\uFE32', '\uFE58', '\uFE63', '\uFF0D', '\u10EAD'],
'PGUCPF': ['\u00BB', '\u2019', '\u201D', '\u203A', '\u2E03', '\u2E05', '\u2E0A', '\u2E0D', '\u2E1D', '\u2E21'],
'PGUCPI': ['\u00AB', '\u2018', '\u201B', '\u201C', '\u201F', '\u2039', '\u2E02', '\u2E04', '\u2E09', '\u2E0C', '\u2E1C', '\u2E20'],
'PGUCPO': ['\u0021', '\u0022', '\u0023', '\u0025', '\u0026', '\u0027', '\u002A', '\u002C', '\u002E', '\u002F', '\u003A', '\u003B', '\u003F', '\u0040', '\u005C', '\u00A1', '\u00A7', '\u00B6', '\u00B7', '\u00BF', '\u037E', '\u0387', '\u055A', '\u055B', '\u055C', '\u055D', '\u055E', '\u055F', '\u0589', '\u05C0', '\u05C3', '\u05C6', '\u05F3', '\u05F4', '\u0609', '\u060A', '\u060C', '\u060D', '\u061B', '\u061E', '\u061F', '\u066A', '\u066B', '\u066C', '\u066D', '\u06D4', '\u0700', '\u0701', '\u0702', '\u0703', '\u0704', '\u0705', '\u0706', '\u0707', '\u0708', '\u0709', '\u070A', '\u070B', '\u070C', '\u070D', '\u07F7', '\u07F8', '\u07F9', '\u0830', '\u0831', '\u0832', '\u0833', '\u0834', '\u0835', '\u0836', '\u0837', '\u0838', '\u0839', '\u083A', '\u083B', '\u083C', '\u083D', '\u083E', '\u085E', '\u0964', '\u0965', '\u0970', '\u09FD', '\u0A76', '\u0AF0', '\u0C77', '\u0C84', '\u0DF4', '\u0E4F', '\u0E5A', '\u0E5B', '\u0F04', '\u0F05', '\u0F06', '\u0F07', '\u0F08', '\u0F09', '\u0F0A', '\u0F0B', '\u0F0C', '\u0F0D', '\u0F0E', '\u0F0F', '\u0F10', '\u0F11', '\u0F12', '\u0F14', '\u0F85', '\u0FD0', '\u0FD1', '\u0FD2', '\u0FD3', '\u0FD4', '\u0FD9', '\u0FDA', '\u104A', '\u104B', '\u104C', '\u104D', '\u104E', '\u104F', '\u10FB', '\u1360', '\u1361', '\u1362', '\u1363', '\u1364', '\u1365', '\u1366', '\u1367', '\u1368', '\u166E', '\u16EB', '\u16EC', '\u16ED', '\u1735', '\u1736', '\u17D4', '\u17D5', '\u17D6', '\u17D8', '\u17D9', '\u17DA', '\u1800', '\u1801', '\u1802', '\u1803', '\u1804', '\u1805', '\u1807', '\u1808', '\u1809', '\u180A', '\u1944', '\u1945', '\u1A1E', '\u1A1F', '\u1AA0', '\u1AA1', '\u1AA2', '\u1AA3', '\u1AA4', '\u1AA5', '\u1AA6', '\u1AA8', '\u1AA9', '\u1AAA', '\u1AAB', '\u1AAC', '\u1AAD', '\u1B5A', '\u1B5B', '\u1B5C', '\u1B5D', '\u1B5E', '\u1B5F', '\u1B60', '\u1BFC', '\u1BFD', '\u1BFE', '\u1BFF', '\u1C3B', '\u1C3C', '\u1C3D', '\u1C3E', '\u1C3F', '\u1C7E', '\u1C7F', '\u1CC0', '\u1CC1', '\u1CC2', '\u1CC3', '\u1CC4', '\u1CC5', '\u1CC6', '\u1CC7', '\u1CD3', '\u2016', 
'\u2017', '\u2020', '\u2021', '\u2022', '\u2023', '\u2024', '\u2025', '\u2026', '\u2027', '\u2030', '\u2031', '\u2032', '\u2033', '\u2034', '\u2035', '\u2036', '\u2037', '\u2038', '\u203B', '\u203C', '\u203D', '\u203E', '\u2041', '\u2042', '\u2043', '\u2047', '\u2048', '\u2049', '\u204A', '\u204B', '\u204C', '\u204D', '\u204E', '\u204F', '\u2050', '\u2051', '\u2053', '\u2055', '\u2056', '\u2057', '\u2058', '\u2059', '\u205A', '\u205B', '\u205C', '\u205D', '\u205E', '\u2CF9', '\u2CFA', '\u2CFB', '\u2CFC', '\u2CFE', '\u2CFF', '\u2D70', '\u2E00', '\u2E01', '\u2E06', '\u2E07', '\u2E08', '\u2E0B', '\u2E0E', '\u2E0F', '\u2E10', '\u2E11', '\u2E12', '\u2E13', '\u2E14', '\u2E15', '\u2E16', '\u2E18', '\u2E19', '\u2E1B', '\u2E1E', '\u2E1F', '\u2E2A', '\u2E2B', '\u2E2C', '\u2E2D', '\u2E2E', '\u2E30', '\u2E31', '\u2E32', '\u2E33', '\u2E34', '\u2E35', '\u2E36', '\u2E37', '\u2E38', '\u2E39', '\u2E3C', '\u2E3D', '\u2E3E', '\u2E3F', '\u2E41', '\u2E43', '\u2E44', '\u2E45', '\u2E46', '\u2E47', '\u2E48', '\u2E49', '\u2E4A', '\u2E4B', '\u2E4C', '\u2E4D', '\u2E4E', '\u2E4F', '\u2E52', '\u3001', '\u3002', '\u3003', '\u303D', '\u30FB', '\uA4FE', '\uA4FF', '\uA60D', '\uA60E', '\uA60F', '\uA673', '\uA67E', '\uA6F2', '\uA6F3', '\uA6F4', '\uA6F5', '\uA6F6', '\uA6F7', '\uA874', '\uA875', '\uA876', '\uA877', '\uA8CE', '\uA8CF', '\uA8F8', '\uA8F9', '\uA8FA', '\uA8FC', '\uA92E', '\uA92F', '\uA95F', '\uA9C1', '\uA9C2', '\uA9C3', '\uA9C4', '\uA9C5', '\uA9C6', '\uA9C7', '\uA9C8', '\uA9C9', '\uA9CA', '\uA9CB', '\uA9CC', '\uA9CD', '\uA9DE', '\uA9DF', '\uAA5C', '\uAA5D', '\uAA5E', '\uAA5F', '\uAADE', '\uAADF', '\uAAF0', '\uAAF1', '\uABEB', '\uFE10', '\uFE11', '\uFE12', '\uFE13', '\uFE14', '\uFE15', '\uFE16', '\uFE19', '\uFE30', '\uFE45', '\uFE46', '\uFE49', '\uFE4A', '\uFE4B', '\uFE4C', '\uFE50', '\uFE51', '\uFE52', '\uFE54', '\uFE55', '\uFE56', '\uFE57', '\uFE5F', '\uFE60', '\uFE61', '\uFE68', '\uFE6A', '\uFE6B', '\uFF01', '\uFF02', '\uFF03', '\uFF05', '\uFF06', '\uFF07', '\uFF0A', '\uFF0C', '\uFF0E', 
'\uFF0F', '\uFF1A', '\uFF1B', '\uFF1F', '\uFF20', '\uFF3C', '\uFF61', '\uFF64', '\uFF65', '\u10100', '\u10101', '\u10102', '\u1039F', '\u103D0', '\u1056F', '\u10857', '\u1091F', '\u1093F', '\u10A50', '\u10A51', '\u10A52', '\u10A53', '\u10A54', '\u10A55', '\u10A56', '\u10A57', '\u10A58', '\u10A7F', '\u10AF0', '\u10AF1', '\u10AF2', '\u10AF3', '\u10AF4', '\u10AF5', '\u10AF6', '\u10B39', '\u10B3A', '\u10B3B', '\u10B3C', '\u10B3D', '\u10B3E', '\u10B3F', '\u10B99', '\u10B9A', '\u10B9B', '\u10B9C', '\u10F55', '\u10F56', '\u10F57', '\u10F58', '\u10F59', '\u11047', '\u11048', '\u11049', '\u1104A', '\u1104B', '\u1104C', '\u1104D', '\u110BB', '\u110BC', '\u110BE', '\u110BF', '\u110C0', '\u110C1', '\u11140', '\u11141', '\u11142', '\u11143', '\u11174', '\u11175', '\u111C5', '\u111C6', '\u111C7', '\u111C8', '\u111CD', '\u111DB', '\u111DD', '\u111DE', '\u111DF', '\u11238', '\u11239', '\u1123A', '\u1123B', '\u1123C', '\u1123D', '\u112A9', '\u1144B', '\u1144C', '\u1144D', '\u1144E', '\u1144F', '\u1145A', '\u1145B', '\u1145D', '\u114C6', '\u115C1', '\u115C2', '\u115C3', '\u115C4', '\u115C5', '\u115C6', '\u115C7', '\u115C8', '\u115C9', '\u115CA', '\u115CB', '\u115CC', '\u115CD', '\u115CE', '\u115CF', '\u115D0', '\u115D1', '\u115D2', '\u115D3', '\u115D4', '\u115D5', '\u115D6', '\u115D7', '\u11641', '\u11642', '\u11643', '\u11660', '\u11661', '\u11662', '\u11663', '\u11664', '\u11665', '\u11666', '\u11667', '\u11668', '\u11669', '\u1166A', '\u1166B', '\u1166C', '\u1173C', '\u1173D', '\u1173E', '\u1183B', '\u11944', '\u11945', '\u11946', '\u119E2', '\u11A3F', '\u11A40', '\u11A41', '\u11A42', '\u11A43', '\u11A44', '\u11A45', '\u11A46', '\u11A9A', '\u11A9B', '\u11A9C', '\u11A9E', '\u11A9F', '\u11AA0', '\u11AA1', '\u11AA2', '\u11C41', '\u11C42', '\u11C43', '\u11C44', '\u11C45', '\u11C70', '\u11C71', '\u11EF7', '\u11EF8', '\u11FFF', '\u12470', '\u12471', '\u12472', '\u12473', '\u12474', '\u16A6E', '\u16A6F', '\u16AF5', '\u16B37', '\u16B38', '\u16B39', '\u16B3A', '\u16B3B', '\u16B44', 
'\u16E97', '\u16E98', '\u16E99', '\u16E9A', '\u16FE2', '\u1BC9F', '\u1DA87', '\u1DA88', '\u1DA89', '\u1DA8A', '\u1DA8B', '\u1E95E', '\u1E95F'],
'PGUCPS': ['\u0028', '\u005B', '\u007B', '\u0F3A', '\u0F3C', '\u169B', '\u201A', '\u201E', '\u2045', '\u207D', '\u208D', '\u2308', '\u230A', '\u2329', '\u2768', '\u276A', '\u276C', '\u276E', '\u2770', '\u2772', '\u2774', '\u27C5', '\u27E6', '\u27E8', '\u27EA', '\u27EC', '\u27EE', '\u2983', '\u2985', '\u2987', '\u2989', '\u298B', '\u298D', '\u298F', '\u2991', '\u2993', '\u2995', '\u2997', '\u29D8', '\u29DA', '\u29FC', '\u2E22', '\u2E24', '\u2E26', '\u2E28', '\u2E42', '\u3008', '\u300A', '\u300C', '\u300E', '\u3010', '\u3014', '\u3016', '\u3018', '\u301A', '\u301D', '\uFD3F', '\uFE17', '\uFE35', '\uFE37', '\uFE39', '\uFE3B', '\uFE3D', '\uFE3F', '\uFE41', '\uFE43', '\uFE47', '\uFE59', '\uFE5B', '\uFE5D', '\uFF08', '\uFF3B', '\uFF5B', '\uFF5F', '\uFF62'],
'PGUCPC': [
'\u005F',
'\u203F',
'\u2040',
'\u2054',
'\uFE33',
'\uFE33',
'\uFE4D',
'\uFE4E',
'\uFE4F',
'\uFF3F'
],
'PGUCPD': [
'\u002D',
'\u058A',
'\u05BE',
'\u1400',
'\u1806',
'\u2010',
'\u2011',
'\u2012',
'\u2013',
'\u2014',
'\u2015',
'\u2E17',
'\u2E1A',
'\u2E3A',
'\u2E3B',
'\u2E40',
'\u301C',
'\u3030',
'\u30A0',
'\uFE31',
'\uFE32',
'\uFE58',
'\uFE63',
'\uFF0D',
'\u10EAD'
],
'PGUCPF': [
'\u00BB',
'\u2019',
'\u201D',
'\u203A',
'\u2E03',
'\u2E05',
'\u2E0A',
'\u2E0D',
'\u2E1D',
'\u2E21'
],
'PGUCPI': [
'\u00AB',
'\u2018',
'\u201B',
'\u201C',
'\u201F',
'\u2039',
'\u2E02',
'\u2E04',
'\u2E09',
'\u2E0C',
'\u2E1C',
'\u2E20'
],
'PGUCPO': [
'\u0021',
'\u0022',
'\u0023',
'\u0025',
'\u0026',
'\u0027',
'\u002A',
'\u002C',
'\u002E',
'\u002F',
'\u003A',
'\u003B',
'\u003F',
'\u0040',
'\u005C',
'\u00A1',
'\u00A7',
'\u00B6',
'\u00B7',
'\u00BF',
'\u037E',
'\u0387',
'\u055A',
'\u055B',
'\u055C',
'\u055D',
'\u055E',
'\u055F',
'\u0589',
'\u05C0',
'\u05C3',
'\u05C6',
'\u05F3',
'\u05F4',
'\u0609',
'\u060A',
'\u060C',
'\u060D',
'\u061B',
'\u061E',
'\u061F',
'\u066A',
'\u066B',
'\u066C',
'\u066D',
'\u06D4',
'\u0700',
'\u0701',
'\u0702',
'\u0703',
'\u0704',
'\u0705',
'\u0706',
'\u0707',
'\u0708',
'\u0709',
'\u070A',
'\u070B',
'\u070C',
'\u070D',
'\u07F7',
'\u07F8',
'\u07F9',
'\u0830',
'\u0831',
'\u0832',
'\u0833',
'\u0834',
'\u0835',
'\u0836',
'\u0837',
'\u0838',
'\u0839',
'\u083A',
'\u083B',
'\u083C',
'\u083D',
'\u083E',
'\u085E',
'\u0964',
'\u0965',
'\u0970',
'\u09FD',
'\u0A76',
'\u0AF0',
'\u0C77',
'\u0C84',
'\u0DF4',
'\u0E4F',
'\u0E5A',
'\u0E5B',
'\u0F04',
'\u0F05',
'\u0F06',
'\u0F07',
'\u0F08',
'\u0F09',
'\u0F0A',
'\u0F0B',
'\u0F0C',
'\u0F0D',
'\u0F0E',
'\u0F0F',
'\u0F10',
'\u0F11',
'\u0F12',
'\u0F14',
'\u0F85',
'\u0FD0',
'\u0FD1',
'\u0FD2',
'\u0FD3',
'\u0FD4',
'\u0FD9',
'\u0FDA',
'\u104A',
'\u104B',
'\u104C',
'\u104D',
'\u104E',
'\u104F',
'\u10FB',
'\u1360',
'\u1361',
'\u1362',
'\u1363',
'\u1364',
'\u1365',
'\u1366',
'\u1367',
'\u1368',
'\u166E',
'\u16EB',
'\u16EC',
'\u16ED',
'\u1735',
'\u1736',
'\u17D4',
'\u17D5',
'\u17D6',
'\u17D8',
'\u17D9',
'\u17DA',
'\u1800',
'\u1801',
'\u1802',
'\u1803',
'\u1804',
'\u1805',
'\u1807',
'\u1808',
'\u1809',
'\u180A',
'\u1944',
'\u1945',
'\u1A1E',
'\u1A1F',
'\u1AA0',
'\u1AA1',
'\u1AA2',
'\u1AA3',
'\u1AA4',
'\u1AA5',
'\u1AA6',
'\u1AA8',
'\u1AA9',
'\u1AAA',
'\u1AAB',
'\u1AAC',
'\u1AAD',
'\u1B5A',
'\u1B5B',
'\u1B5C',
'\u1B5D',
'\u1B5E',
'\u1B5F',
'\u1B60',
'\u1BFC',
'\u1BFD',
'\u1BFE',
'\u1BFF',
'\u1C3B',
'\u1C3C',