textout/textoutpc/lexer.py

#!/usr/bin/env python
# *****************************************************************************
# Copyright (C) 2018-2023 Thomas Touhey <thomas@touhey.fr>
# This file is part of the textoutpc project, which is MIT-licensed.
# *****************************************************************************
"""Lexer definition for textoutpc."""

from __future__ import annotations

from collections.abc import Iterator
from io import StringIO
from typing import Any, NamedTuple, TextIO, Union

import regex
from typing_extensions import TypeAlias


__all__ = [
    "CloseTagEntity",
    "NewlineEntity",
    "OpenTagEntity",
    "SpecialEntity",
    "TextEntity",
    "iter_textout_entities",
]

# A tag can basically be one of the following things:
# - a starting tag, looking like [<name>] or [<name>=<attribute>]
# - an ending tag, looking like [/<name>]
# - a special tag (starting or ending), usually one-char (the only
#   one currently available is the ` tag).
#
# A tag name is 32 chars at most (at least 1 char).
# A closing tag can have no name, which means that it will close the
# last opened tag automatically.
# A tag attribute is 256 chars at most.
#
# FIXME: Check the sizes.
# Maximum number of characters accepted for a tag name.
MAX_TAG_NAME_SIZE: int = 32
# Maximum number of characters accepted for a tag value (attribute).
MAX_TAG_VALUE_SIZE: int = 256
# Size limit applied to an incomplete entity while lexing: name, value and
# three extra characters (presumably "[", "=" and "]" -- the optional
# whitespace accepted by the pattern below is not counted; see the FIXME
# above).
MAX_ENTITY_SIZE: int = MAX_TAG_NAME_SIZE + MAX_TAG_VALUE_SIZE + 3
BUFFER_SIZE: int = 1024  # Must be more than MAX_ENTITY_SIZE!
# Pattern matching a single entity. The alternatives are, in order:
# - a closing tag: "[", optional whitespace, "/" or "\", an optional name
#   (group "ename"), then "]";
# - an opening tag: "[", optional whitespace, a name (group "bname"), an
#   optional "= value" part (group "value"), then "]";
# - a newline, as "\n", "\r\n" or "\r" (group "newline");
# - a backtick (group "sname").
# Names cannot contain "="; names and values may contain bracketed parts,
# which are matched through recursive calls to the "*_e" helper groups.
ENTITY_RE = regex.compile(
    r"""
        \[\s*[\\\/] (?P<ename>
            (?P<ename_e>
                [^\[\]\=]+ (\[(?&ename_e)*\]?)*
                | [^\[\]\=]* (\[(?&ename_e)*\]?)+
            )*
        )
        \s?\]
    |
        \[\s* (?P<bname>
            (?P<bname_e>
                [^\[\]\=]* (\[(?P&bname_e)*\]?)+
                | [^\[\]\=]+ (\[(?P&bname_e)*\]?)*
            )+
        )
        (\s* = \s* (?P<value>
            (?P<value_e>
                [^\[\]]* (\[(?&value_e)*\]?)+
                | [^\[\]]+ (\[(?&value_e)*\]?)*
            )*
        ))?
        \s?\]
    |
        (?P<newline>\n|\r\n|\r)
    |
        (?P<sname>`)
    """,
    regex.VERBOSE | regex.DOTALL | regex.MULTILINE,
)


class OpenTagEntity(NamedTuple):
    """Explicit opening of a tag.

    Two opening tags are equal when their name and value are equal; the raw
    text is ignored by ``==``, ``!=`` and ``hash()``.
    """

    name: str
    """Name of the tag that is being opened."""

    value: str | None = None
    """Optional value transmitted with the tag."""

    raw: str = ""
    """Raw entity, if need be to yield it."""

    def __eq__(self, other: Any) -> bool:
        return (
            isinstance(other, OpenTagEntity)
            and other.name == self.name
            and other.value == self.value
        )

    def __ne__(self, other: Any) -> bool:
        # tuple defines its own __ne__, which compares every field (including
        # ``raw``) and would therefore disagree with __eq__.
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Keep hashing consistent with __eq__, which ignores ``raw``.
        return hash((self.name, self.value))


class CloseTagEntity(NamedTuple):
    """Closing of a tag.

    Two closing tags are equal when their name is equal; the raw text is
    ignored by ``==``, ``!=`` and ``hash()``.

    :param name: The name of the tag that is being closed.
    :param raw: The raw entity, if need be to yield it.
    """

    name: str
    """Name of the tag that is being closed."""

    raw: str = ""
    """Raw entity, if need be to yield it."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, CloseTagEntity) and other.name == self.name

    def __ne__(self, other: Any) -> bool:
        # tuple defines its own __ne__, which compares every field (including
        # ``raw``) and would therefore disagree with __eq__.
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Keep hashing consistent with __eq__, which ignores ``raw``.
        return hash(self.name)


class SpecialEntity(NamedTuple):
    """Special characters that could mean the opening or closing of a tag.

    :param value: The special character(s) for the entity.
    """

    value: str
    """Special character(s) for the entity."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, SpecialEntity) and other.value == self.value

    def __ne__(self, other: Any) -> bool:
        # tuple defines its own __ne__, which ignores the type of ``other``
        # and would therefore disagree with __eq__.
        return not self.__eq__(other)


class NewlineEntity(NamedTuple):
    """Entity representing a newline.

    The entity carries no data: every instance is equal to every other
    instance, and different from any object of another type.
    """

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, NewlineEntity)

    def __ne__(self, other: Any) -> bool:
        # tuple defines its own __ne__, which ignores the type of ``other``
        # and would therefore disagree with __eq__.
        return not self.__eq__(other)


class TextEntity(NamedTuple):
    """Entity representing raw text.

    :param content: The content in the text.
    """

    content: str
    """Content in the text."""

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, TextEntity) and other.content == self.content

    def __ne__(self, other: Any) -> bool:
        # tuple defines its own __ne__, which ignores the type of ``other``
        # and would therefore disagree with __eq__.
        return not self.__eq__(other)


# Shared instance returned for every newline match: NewlineEntity has no
# fields, so a single object can be reused.
NEWLINE_ENTITY_INSTANCE = NewlineEntity()


def get_textout_entity_from_match(
    match: regex.Match,
) -> NewlineEntity | OpenTagEntity | CloseTagEntity | SpecialEntity | None:
    """Build the textout entity described by an entity pattern match.

    :param match: The full (non-partial) match to convert into an entity.
    :return: The resulting entity, or None if the matched tag name or tag
        value is longer than the allowed maximum.
    """
    groups = match.groupdict()

    # Newlines carry no data, the shared instance is enough.
    if groups["newline"] is not None:
        return NEWLINE_ENTITY_INSTANCE

    # Opening tag, with an optional value.
    opening_name = groups["bname"]
    if opening_name is not None:
        opening_value = groups["value"]
        name_too_long = len(opening_name) > MAX_TAG_NAME_SIZE
        value_too_long = (
            opening_value is not None
            and len(opening_value) > MAX_TAG_VALUE_SIZE
        )
        if name_too_long or value_too_long:
            return None

        raw_opening = match.group(0)
        return OpenTagEntity(
            name=opening_name.casefold(),
            value=opening_value,
            raw=raw_opening,
        )

    # Closing tag; the name may be empty.
    closing_name = groups["ename"]
    if closing_name is not None:
        if len(closing_name) > MAX_TAG_NAME_SIZE:
            return None

        raw_closing = match.group(0)
        return CloseTagEntity(name=closing_name.casefold(), raw=raw_closing)

    # Only the special character alternative remains.
    special = groups["sname"]
    if special is None:  # pragma: no cover
        raise AssertionError("sname should be filled here!")

    return SpecialEntity(value=special)


# Union of every entity type yielded by iter_textout_entities().
Entity: TypeAlias = Union[
    OpenTagEntity,
    CloseTagEntity,
    SpecialEntity,
    NewlineEntity,
    TextEntity,
]


def iter_textout_entities(
    stream_or_string: TextIO | str,
    /,
) -> Iterator[Entity]:
    """Iterate over textout entities.

    The input is read in chunks of at most ``BUFFER_SIZE`` characters and
    searched for tags, newlines and special characters. Everything found
    between two such entities is accumulated and yielded as a single
    :py:class:`TextEntity`, so that no two text entities are yielded in a row.

    :param stream_or_string: The text stream or string to read from.
    :return: The iterator for textout entities and raw text.
    """
    stream: TextIO | None
    if isinstance(stream_or_string, str):
        stream = StringIO(stream_or_string)
    else:
        stream = stream_or_string

    buf = ""  # Current buffer of unprocessed input.
    text = ""  # Text buffer, to avoid consecutive text yields.

    while True:
        if not buf and stream is not None:
            buf = stream.read(BUFFER_SIZE)

        if not buf:
            break

        # Try and match a tag.
        result = ENTITY_RE.search(buf, partial=True)
        if not result or not result.group(0):
            text += buf
            buf = ""
            continue

        # Everything before the match is raw text; the buffer now starts
        # at the match.
        start, end = result.span()
        if start > 0:
            text += buf[:start]
            buf = buf[start:]

        if not result.partial:
            # Result is exploitable as is, with one exception: a lone "\r"
            # ending the buffer may be the first half of a "\r\n" whose "\n"
            # has not been read yet. Accepting it now would yield two
            # newlines for one line ending, so complete the buffer first.
            if buf == "\r" and stream is not None:
                new_data = stream.read(BUFFER_SIZE - len(buf))
                if new_data:
                    buf += new_data
                    continue

                # Nothing more to read, the "\r" stands on its own.
                stream = None
        elif len(buf) >= MAX_ENTITY_SIZE:
            # A partial result cannot be more than the maximum entity size!
            # In such case, maybe if we start later, we can get a full match?
            text += buf[:1]
            buf = buf[1:]
            continue
        else:
            # We need to complete the buffer from here to get a full tag.
            if stream is not None:
                new_data = stream.read(BUFFER_SIZE - len(buf))
                if new_data:
                    # We have more data to complete the match, we need to try!
                    buf += new_data
                    continue

            # We've reached the end of our stream, we need to continue with
            # what we've got. Maybe if we start later, we can get a full
            # match?
            text += buf[:1]
            buf = buf[1:]
            stream = None
            continue

        entity = get_textout_entity_from_match(result)
        if entity is None:
            # The match is not a valid entity (name or value too long):
            # treat its first character as text and retry from the next one.
            text += buf[:1]
            buf = buf[1:]
            continue

        if text:
            yield TextEntity(content=text)
            text = ""

        # Drop the matched entity from the buffer.
        buf = buf[end - start :]
        yield entity

    if text:
        yield TextEntity(content=text)