#!/usr/bin/env python
# *****************************************************************************
# Copyright (C) 2018-2023 Thomas Touhey <thomas@touhey.fr>
# This file is part of the textoutpc project, which is MIT-licensed.
# *****************************************************************************
"""Lexer definition for textoutpc."""
|
|
|
|
from __future__ import annotations

from collections.abc import Iterator
from io import StringIO
from typing import Any, NamedTuple, TextIO, Union

import regex
from typing_extensions import TypeAlias
|
|
|
|
|
|
# Names exported by ``from <module> import *``.
__all__ = [
    "CloseTagEntity",
    "NewlineEntity",
    "OpenTagEntity",
    "SpecialEntity",
    "TextEntity",
    "iter_textout_entities",
]
|
|
|
|
# A tag can basically be one of the following things:
# - a starting tag, looking like [<name>] or [<name>=<attribute>]
# - an ending tag, looking like [/<name>]
# - a special tag (starting or ending), usually one-char (the only
#   one currently available is the ` tag).
#
# A tag name is 32 chars at most (at least 1 char).
# A closing tag can have no name, which means that it will close the
# last opened tag automatically.
# A tag attribute is 256 chars at most.
#
# FIXME: Check the sizes.
MAX_TAG_NAME_SIZE: int = 32
MAX_TAG_VALUE_SIZE: int = 256

# Longest entity the lexer is willing to wait for: a name, a value, and three
# extra characters (presumably "[", "=" and "]" -- TODO confirm, this does not
# account for the "/" of a closing tag or for optional whitespace).
MAX_ENTITY_SIZE: int = MAX_TAG_NAME_SIZE + MAX_TAG_VALUE_SIZE + 3

# Number of characters requested from the input stream at a time.
BUFFER_SIZE: int = 1024  # Must be more than MAX_ENTITY_SIZE!
|
|
# Pattern recognising everything the lexer treats as an entity. It is an
# alternation of four forms, tried in this order:
#
# - a closing tag, "[/name]" (a backslash is accepted in place of the slash,
#   and the name may be empty), captured as "ename";
# - an opening tag, "[name]" or "[name=value]", captured as "bname" and
#   "value";
# - a newline (LF, CR LF or a lone CR), captured as "newline";
# - a backquote, captured as "sname".
#
# The "ename_e", "bname_e" and "value_e" groups are called recursively so that
# names and values may themselves contain bracketed sections, whose closing
# bracket is optional. No length limit is applied by the pattern itself;
# get_textout_entity_from_match checks the captured name and value against
# MAX_TAG_NAME_SIZE and MAX_TAG_VALUE_SIZE.
ENTITY_RE = regex.compile(
    r"""
    \[\s*[\\\/] (?P<ename>
        (?P<ename_e>
            [^\[\]\=]+ (\[(?&ename_e)*\]?)*
            | [^\[\]\=]* (\[(?&ename_e)*\]?)+
        )*
    )
    \s?\]
    |
    \[\s* (?P<bname>
        (?P<bname_e>
            [^\[\]\=]* (\[(?P&bname_e)*\]?)+
            | [^\[\]\=]+ (\[(?P&bname_e)*\]?)*
        )+
    )
    (\s* = \s* (?P<value>
        (?P<value_e>
            [^\[\]]* (\[(?&value_e)*\]?)+
            | [^\[\]]+ (\[(?&value_e)*\]?)*
        )*
    ))?
    \s?\]
    |
    (?P<newline>\n|\r\n|\r)
    |
    (?P<sname>`)
    """,
    regex.VERBOSE | regex.DOTALL | regex.MULTILINE,
)
|
|
|
|
|
|
class OpenTagEntity(NamedTuple):
    """Explicit opening of a tag.

    Two opening tags are considered equal when their name and value match;
    the raw text they were read from is ignored by ``==``, ``!=`` and
    ``hash()`` alike.
    """

    name: str
    """Name of the tag that is being opened."""

    value: str | None = None
    """Optional value transmitted with the tag."""

    raw: str = ""
    """Raw entity, if need be to yield it."""

    def __eq__(self, other: Any) -> bool:
        return (
            isinstance(other, OpenTagEntity)
            and other.name == self.name
            and other.value == self.value
        )

    def __ne__(self, other: Any) -> bool:
        # tuple brings its own __ne__, which compares every field (``raw``
        # included) and accepts plain tuples; without this override an
        # entity could be both == and != to another one.
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Leave ``raw`` out, as __eq__ does, so that equal entities always
        # share a hash.
        return hash((self.name, self.value))
|
|
|
|
|
|
class CloseTagEntity(NamedTuple):
    """Closing of a tag, for textout BBCode.

    Two closing tags are considered equal when their name matches; the raw
    text they were read from is ignored by ``==``, ``!=`` and ``hash()``
    alike.

    :param name: The name of the tag that is being closed.
    :param raw: The raw entity, if need be to yield it.
    """

    name: str
    """Name of the tag that is being closed."""

    raw: str = ""
    """Raw entity, if need be to yield it."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, CloseTagEntity) and other.name == self.name

    def __ne__(self, other: Any) -> bool:
        # tuple brings its own __ne__, which compares every field (``raw``
        # included) and accepts plain tuples; keep != the exact negation
        # of ==.
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Leave ``raw`` out, as __eq__ does, so that equal entities always
        # share a hash.
        return hash((self.name,))
|
|
|
|
|
|
class SpecialEntity(NamedTuple):
    """Special characters that could mean the opening or closing of a tag.

    :param value: The special character(s) for the entity.
    """

    value: str
    """Special character(s) for the entity."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, SpecialEntity) and other.value == self.value

    def __ne__(self, other: Any) -> bool:
        # tuple brings its own __ne__, which treats any tuple with the same
        # content as "not different"; keep != the exact negation of ==.
        return not self.__eq__(other)
|
|
|
|
|
|
class NewlineEntity(NamedTuple):
    """Entity representing a newline.

    It carries no data: every instance is equal to every other instance and
    to nothing else.
    """

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, NewlineEntity)

    def __ne__(self, other: Any) -> bool:
        # tuple brings its own __ne__, under which an empty tuple would be
        # "not different" from a newline; keep != the exact negation of ==.
        return not self.__eq__(other)
|
|
|
|
|
|
class TextEntity(NamedTuple):
    """Entity representing raw text."""

    content: str
    """Content in the text."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, TextEntity) and other.content == self.content

    def __ne__(self, other: Any) -> bool:
        # tuple brings its own __ne__, which treats any tuple with the same
        # content as "not different"; keep != the exact negation of ==.
        return not self.__eq__(other)
|
|
|
|
|
|
# Shared instance handed out by get_textout_entity_from_match for every
# newline match; NewlineEntity has no fields, so a single object is enough.
NEWLINE_ENTITY_INSTANCE = NewlineEntity()
|
|
|
|
|
|
def get_textout_entity_from_match(
    match: regex.Match,
) -> NewlineEntity | OpenTagEntity | CloseTagEntity | SpecialEntity | None:
    """Get a textout entity from the given match.

    The named groups of the match are inspected in turn to decide which
    entity to build: "newline", then "bname" (with its optional "value"),
    then "ename", and finally "sname". Tag names are case-folded; values
    and the raw matched text are kept as they were matched.

    :param match: The full (non-partial) match to yield an entity from.
    :return: The obtained entity, or None if the tag name is longer than
        ``MAX_TAG_NAME_SIZE`` or the tag value is longer than
        ``MAX_TAG_VALUE_SIZE``.
    :raises AssertionError: If none of the expected groups is set.
    """
    parts = match.groupdict()
    if parts["newline"] is not None:
        return NEWLINE_ENTITY_INSTANCE

    if parts["bname"] is not None:
        name = parts["bname"]
        value = parts["value"]

        # Oversized opening tags are rejected rather than truncated.
        if len(name) > MAX_TAG_NAME_SIZE or (
            value is not None and len(value) > MAX_TAG_VALUE_SIZE
        ):
            return None

        return OpenTagEntity(
            name=name.casefold(),
            value=value,
            raw=match.group(0),
        )

    if parts["ename"] is not None:
        name = parts["ename"]

        # Oversized closing tags are rejected as well.
        if len(name) > MAX_TAG_NAME_SIZE:
            return None

        return CloseTagEntity(
            name=name.casefold(),
            raw=match.group(0),
        )

    # Only the special-character group is left at this point.
    if parts["sname"] is None:  # pragma: no cover
        raise AssertionError("sname should be filled here!")

    return SpecialEntity(value=parts["sname"])
|
|
|
|
|
|
# Any of the entity types produced by iter_textout_entities.
Entity: TypeAlias = Union[
    OpenTagEntity,
    CloseTagEntity,
    SpecialEntity,
    NewlineEntity,
    TextEntity,
]
|
|
|
|
|
|
def iter_textout_entities(
    stream_or_string: TextIO | str,
    /,
) -> Iterator[Entity]:
    """Iterate over textout entities.

    The input is read in chunks of at most ``BUFFER_SIZE`` characters and
    searched with ``ENTITY_RE`` in partial-matching mode, so that an entity
    cut by the end of the buffer is only decided once more input has been
    read. Everything that does not end up as an entity is gathered and
    emitted as one :py:class:`TextEntity` right before the next entity, or
    at the end of the input.

    :param stream_or_string: The text stream or string to read from.
    :return: The iterator for textout entities and raw text.
    """
    stream: TextIO | None
    if isinstance(stream_or_string, str):
        stream = StringIO(stream_or_string)
    else:
        stream = stream_or_string

    buf = ""  # Current buffer of unprocessed input.

    # Text chunks waiting to be yielded together, to avoid consecutive text
    # yields. Only non-empty chunks are ever appended, so the list being
    # non-empty means there is text to emit.
    pending: list[str] = []

    while True:
        if not buf and stream is not None:
            buf = stream.read(BUFFER_SIZE)

        if not buf:
            break

        # Try and match a tag.
        result = ENTITY_RE.search(buf, partial=True)
        if not result or not result.group(0):
            # Nothing that even starts like an entity: all of it is text.
            pending.append(buf)
            buf = ""
            continue

        # Whatever precedes the candidate entity is plain text.
        start, end = result.span()
        if start > 0:
            pending.append(buf[:start])
            buf = buf[start:]

        if result.partial:
            if len(buf) < MAX_ENTITY_SIZE:
                # We need to complete the buffer from here to get a full
                # tag.
                if stream is not None:
                    new_data = stream.read(BUFFER_SIZE - len(buf))
                    if new_data:
                        # We have data to complete the match, we need to
                        # try again!
                        buf += new_data
                        continue

                # We've reached the end of our stream, we need to continue
                # with what we've got.
                # NOTE(review): a lone "\r" ending the input presumably
                # shows up here as a partial "\r\n" match and is then
                # emitted as text rather than as a newline -- confirm.
                stream = None

            # Either the partial result is already as long as the maximum
            # entity size, or no more input is coming. Maybe if we start
            # one character later, we can get a full match?
            pending.append(buf[:1])
            buf = buf[1:]
            continue

        # Result is actually exploitable, we can go on!
        entity = get_textout_entity_from_match(result)
        if entity is None:
            # Rejected (e.g. oversized) entity: retry one character later.
            pending.append(buf[:1])
            buf = buf[1:]
            continue

        if pending:
            yield TextEntity(content="".join(pending))
            pending = []

        buf = buf[end - start :]
        yield entity

    if pending:
        yield TextEntity(content="".join(pending))
|