2
0
Fork 0
textout/textoutpc/lexer.py

278 lines
7.5 KiB
Python

#!/usr/bin/env python
# *****************************************************************************
# Copyright (C) 2018-2023 Thomas Touhey <thomas@touhey.fr>
# This file is part of the textoutpc project, which is MIT-licensed.
# *****************************************************************************
"""Lexer definition for textoutpc."""
from __future__ import annotations
from collections.abc import Iterator
from io import StringIO
from typing import Any, NamedTuple, TextIO, Union
import regex
from typing_extensions import TypeAlias
__all__ = [
"CloseTagEntity",
"NewlineEntity",
"OpenTagEntity",
"SpecialEntity",
"TextEntity",
"iter_textout_entities",
]
# A tag can basically be one of the following things:
# - a starting tag, looking like [<name>] or [<name>=<attribute>]
# - an ending tag, looking like [/<name>]
# - a special tag (starting or ending), usually one-char (the only
# one currently available is the ` tag).
#
# A tag name is 32 chars at most (at least 1 char).
# A closing tag can have no name, which means that it will close the
# last opened tag automatically.
# A tag attribute is 256 chars at most.
#
# FIXME: Check the sizes.
# Maximum number of characters accepted for a tag name; matches with a longer
# name are rejected by get_textout_entity_from_match().
MAX_TAG_NAME_SIZE: int = 32
# Maximum number of characters accepted for a tag value (attribute); matches
# with a longer value are rejected by get_textout_entity_from_match().
MAX_TAG_VALUE_SIZE: int = 256
# Buffer length from which the lexer stops waiting for a partial match to
# complete.
# NOTE(review): the "+ 3" presumably accounts for "[", "=" and "]"; the
# optional whitespace allowed by ENTITY_RE is not counted -- confirm.
MAX_ENTITY_SIZE: int = MAX_TAG_NAME_SIZE + MAX_TAG_VALUE_SIZE + 3
# Target size of the lexer's input buffer: reads from the stream top the
# buffer up to this many characters.
BUFFER_SIZE: int = 1024 # Must be more than MAX_ENTITY_SIZE!
# Pattern recognising every non-text entity. It has four alternatives:
# - a closing tag, "[/name]" or "[\name]" (group "ename", possibly empty);
# - an opening tag, "[name]" or "[name=value]" (groups "bname" and "value");
# - a newline, one of "\n", "\r\n" or "\r" (group "newline");
# - a backtick (group "sname").
# The "ename_e", "bname_e" and "value_e" helper groups are called recursively
# so that names and values may contain bracketed sections, whose closing
# bracket is optional.
# NOTE(review): the "(?&name)" and "(?P&name)" group calls are syntax of the
# third-party `regex` module; keep this pattern on `regex.compile`.
ENTITY_RE = regex.compile(
r"""
\[\s*[\\\/] (?P<ename>
(?P<ename_e>
[^\[\]\=]+ (\[(?&ename_e)*\]?)*
| [^\[\]\=]* (\[(?&ename_e)*\]?)+
)*
)
\s?\]
|
\[\s* (?P<bname>
(?P<bname_e>
[^\[\]\=]* (\[(?P&bname_e)*\]?)+
| [^\[\]\=]+ (\[(?P&bname_e)*\]?)*
)+
)
(\s* = \s* (?P<value>
(?P<value_e>
[^\[\]]* (\[(?&value_e)*\]?)+
| [^\[\]]+ (\[(?&value_e)*\]?)*
)*
))?
\s?\]
|
(?P<newline>\n|\r\n|\r)
|
(?P<sname>`)
""",
regex.VERBOSE | regex.DOTALL | regex.MULTILINE,
)
class OpenTagEntity(NamedTuple):
    """Explicit opening of a tag.

    Two instances are considered equal when both their name and value
    match; the raw text is ignored by comparisons and hashing.
    """

    name: str
    """Name of the tag that is being opened."""

    value: str | None = None
    """Optional value transmitted with the tag."""

    raw: str = ""
    """Raw entity, if need be to yield it."""

    def __eq__(self, other: Any) -> bool:
        return (
            isinstance(other, OpenTagEntity)
            and other.name == self.name
            and other.value == self.value
        )

    def __ne__(self, other: Any) -> bool:
        # tuple provides its own __ne__, which would otherwise be inherited
        # and compare every field (``raw`` included), contradicting __eq__.
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Equal entities must hash alike, so ``raw`` cannot take part.
        return hash((self.name, self.value))
class CloseTagEntity(NamedTuple):
    """Explicit closing of a tag.

    Two instances are considered equal when their names match; the raw
    text is ignored by comparisons and hashing.

    :param name: The name of the tag that is being closed.
    :param raw: The raw entity, if need be to yield it.
    """

    name: str
    """Name of the tag that is being closed."""

    raw: str = ""
    """Raw entity, if need be to yield it."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, CloseTagEntity) and other.name == self.name

    def __ne__(self, other: Any) -> bool:
        # tuple provides its own __ne__, which would otherwise be inherited
        # and compare every field (``raw`` included), contradicting __eq__.
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Equal entities must hash alike, so ``raw`` cannot take part.
        return hash((self.name,))
class SpecialEntity(NamedTuple):
    """Special characters that could mean the opening or closing of a tag.

    :param value: The special character(s) for the entity.
    """

    value: str
    """Special character(s) for the entity."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, SpecialEntity) and other.value == self.value

    def __ne__(self, other: Any) -> bool:
        # tuple provides its own __ne__, which would otherwise be inherited
        # and report "not different" for any tuple with the same content,
        # even though __eq__ rejects objects of another type.
        return not self.__eq__(other)
class NewlineEntity(NamedTuple):
    """Entity representing a newline.

    It carries no data: any two instances are equal to each other, and to
    nothing else.
    """

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, NewlineEntity)

    def __ne__(self, other: Any) -> bool:
        # tuple provides its own __ne__, which would otherwise be inherited
        # and report "not different" for any empty tuple, even though
        # __eq__ rejects objects of another type.
        return not self.__eq__(other)
class TextEntity(NamedTuple):
    """Entity representing raw text."""

    content: str
    """Content in the text."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, TextEntity) and other.content == self.content

    def __ne__(self, other: Any) -> bool:
        # tuple provides its own __ne__, which would otherwise be inherited
        # and report "not different" for any tuple with the same content,
        # even though __eq__ rejects objects of another type.
        return not self.__eq__(other)
# Shared NewlineEntity instance, returned by get_textout_entity_from_match()
# for every newline match instead of building a new object each time.
NEWLINE_ENTITY_INSTANCE = NewlineEntity()
def get_textout_entity_from_match(
    match: regex.Match,
) -> NewlineEntity | OpenTagEntity | CloseTagEntity | SpecialEntity | None:
    """Build the textout entity described by a match of the entity pattern.

    :param match: The full (non-partial) match to yield an entity from.
    :return: The obtained entity, or None if the tag name or value is
        longer than the allowed maximum.
    """
    groups = match.groupdict()

    # Newlines carry no data, a shared instance is enough.
    if groups["newline"] is not None:
        return NEWLINE_ENTITY_INSTANCE

    # Opening tag, with an optional value.
    opened = groups["bname"]
    if opened is not None:
        attribute = groups["value"]
        if len(opened) > MAX_TAG_NAME_SIZE:
            return None
        if attribute is not None and len(attribute) > MAX_TAG_VALUE_SIZE:
            return None

        # Tag names are case-insensitive; the value is kept verbatim.
        return OpenTagEntity(opened.casefold(), attribute, match.group(0))

    # Closing tag.
    closed = groups["ename"]
    if closed is not None:
        if len(closed) > MAX_TAG_NAME_SIZE:
            return None

        return CloseTagEntity(closed.casefold(), match.group(0))

    # Only the special tag remains.
    special = groups["sname"]
    if special is None:  # pragma: no cover
        raise AssertionError("sname should be filled here!")

    return SpecialEntity(value=special)
# Any entity that iter_textout_entities() may yield: the four kinds built by
# get_textout_entity_from_match(), plus raw text.
Entity: TypeAlias = Union[
OpenTagEntity,
CloseTagEntity,
SpecialEntity,
NewlineEntity,
TextEntity,
]
def iter_textout_entities(
    stream_or_string: TextIO | str,
    /,
) -> Iterator[Entity]:
    """Lex the given input into textout entities and raw text.

    Consecutive pieces of raw text are merged, so that a text entity is
    only yielded right before another kind of entity, or at the very end.

    :param stream_or_string: The text stream or string to read from.
    :return: The iterator for textout entities and raw text.
    """
    # ``source`` is reset to None once the stream is known to be exhausted.
    source: TextIO | None = (
        StringIO(stream_or_string)
        if isinstance(stream_or_string, str)
        else stream_or_string
    )
    pending = ""  # Input read from the source, not yet processed.
    plain = ""  # Raw text accumulated since the last yielded entity.

    while True:
        if source is not None and not pending:
            pending = source.read(BUFFER_SIZE)
        if not pending:
            break

        # Look for the next entity, accepting one that is cut by the end of
        # the buffer.
        found = ENTITY_RE.search(pending, partial=True)
        if not found or not found.group(0):
            # Nothing looking like an entity in there: it is all text.
            plain += pending
            pending = ""
            continue

        # Whatever precedes the candidate is plain text.
        begin, stop = found.span()
        if begin > 0:
            plain += pending[:begin]
            pending = pending[begin:]

        if found.partial:
            if len(pending) < MAX_ENTITY_SIZE:
                # The candidate may be completed by further input.
                if source is not None:
                    extra = source.read(BUFFER_SIZE - len(pending))
                    if extra:
                        pending += extra
                        continue

                # No more input will come; remember not to read again.
                source = None

            # Either the candidate is already too long to be an entity, or
            # the input is over: treat its first character as text and
            # retry from the next one.
            plain += pending[:1]
            pending = pending[1:]
            continue

        entity = get_textout_entity_from_match(found)
        if entity is None:
            # The match was rejected; retry from the next character.
            plain += pending[:1]
            pending = pending[1:]
            continue

        # Flush the text preceding the entity, then the entity itself.
        if plain:
            yield TextEntity(content=plain)
            plain = ""

        pending = pending[stop - begin :]
        yield entity

    if plain:
        yield TextEntity(content=plain)