TeX/src/TeX.y

506 lines
13 KiB
Plaintext
Raw Permalink Normal View History

%{
2019-06-12 18:53:50 +02:00
#include <TeX/config.h>
#include <TeX/node.h>
#include <TeX/flow.h>
#include <TeX/env.h>
#include <stdlib.h>
#include <string.h>
2019-06-12 18:53:50 +02:00
/* yylex(): The lexer, as usual */
int yylex(void);
2019-06-12 18:53:50 +02:00
/* yyerror(): The error function */
void yyerror(char const *message);
//---
2019-06-12 18:53:50 +02:00
// Environment stack management
//---
2019-06-12 18:53:50 +02:00
/* Environment stack and current environment's index */
static struct TeX_Env *env_stack[TEX_ENV_DEPTH] = { NULL };
static int env_index = 0;
2019-06-12 18:53:50 +02:00
/* Current environment and its type */
#define env (env_stack[env_index])
2019-06-12 18:53:50 +02:00
void env_push(char const *type)
{
2019-06-12 18:53:50 +02:00
struct TeX_Env *new_env = NULL;
2019-06-12 18:53:50 +02:00
if(!strcmp(type, "primary"))
new_env = TeX_env_primary();
if(!strcmp(type, "matrix"))
2019-06-18 03:27:04 +02:00
new_env = TeX_env_matrix();
2019-06-12 18:53:50 +02:00
/* If new_env is NULL... still insert it. This is more likely to cause
an error, and avoids asymmetry with pops */
2019-06-12 18:53:50 +02:00
env_index++;
env = new_env;
}
2019-06-12 18:53:50 +02:00
struct TeX_Env *env_pop(void)
{
2019-06-12 18:53:50 +02:00
struct TeX_Env *ret = env;
env_index--;
return ret;
}
2019-06-12 18:53:50 +02:00
//---
// Extra node allocation functions
//---
2019-06-12 18:53:50 +02:00
/* mknode_f(): Make a function node with a flow argument
@name Command name; intended for built-in nodes only
@flow Flow argument
Returns a node for command [name] with as argument [flow]. */
2019-06-12 18:53:50 +02:00
struct TeX_Node *mknode_f(char const *name, struct TeX_Flow *flow)
{
2019-06-12 18:53:50 +02:00
struct TeX_Node *node = TeX_node_command(name);
return TeX_node_add_arg(node, flow);
}
2019-06-12 18:53:50 +02:00
/* mknode_t(): Make a function node with a text argument */
struct TeX_Node *mknode_t(char const *name, char const *text)
{
struct TeX_Flow *flow = TeX_flow_add_node(NULL, TeX_node_text(text));
return mknode_f(name, flow);
}
%}
%define api.value.type union
2019-06-12 18:53:50 +02:00
/* Basic text and commands */
%token <char const *> TEXT
%token <char const *> COMMAND
%token <char const *> COMMAND_ABS
/* Special tokens for environments */
%token ENV_BEGIN
%token ENV_END
%left '^' '_'
%type <struct TeX_Flow *> flow
%type <struct TeX_Node *> node
%type <struct TeX_Node *> node_abs
2019-06-12 18:53:50 +02:00
%type <struct TeX_Node *> env_node
%type <struct TeX_Env *> env_end
%%
2019-06-12 18:53:50 +02:00
env:
%empty { }
| env node { env->add_node(env, $2); }
2019-06-18 03:27:04 +02:00
| env '&' { env->add_separator(env); }
| env '\\' { env->add_break(env); }
flow:
%empty { $$ = NULL; }
2019-06-12 18:53:50 +02:00
| flow node { $$ = TeX_flow_add_node($1, $2); }
node:
2019-06-12 18:53:50 +02:00
TEXT { $$ = TeX_node_text($1); }
| COMMAND { $$ = TeX_node_command($1); }
| node_abs TEXT { $$ = TeX_node_absorb($1, $2); }
| node '{' flow '}' { $$ = TeX_node_add_arg($1, $3); }
/* Special shortcuts for the superscript and subscript classes */
| '^' TEXT { $$ = mknode_t("\\sup", $2); }
| '_' TEXT { $$ = mknode_t("\\sub", $2); }
| '^' '{' flow '}' { $$ = mknode_f("\\sup", $3); }
| '_' '{' flow '}' { $$ = mknode_f("\\sub", $3); }
2019-06-12 18:53:50 +02:00
/* Environments */
| env_node { $$ = $1; }
node_abs:
2019-06-12 18:53:50 +02:00
COMMAND_ABS { $$ = TeX_node_command($1); }
env_node:
/* TODO: Add TeX_mknode_env() */
2019-06-18 03:27:04 +02:00
env_begin env env_end { $$ = TeX_node_env($3); }
2019-06-12 18:53:50 +02:00
env_begin:
ENV_BEGIN '{' TEXT '}' { env_push($3); }
env_end:
ENV_END '{' TEXT '}' { $$ = env_pop(); }
%%
//---
// The lexer
//
// The lexical analysis is actually the subtle part in this program,
// because we want to preserve text sections without cutting them to avoid
// merging string tokens later on.
//
// The program below is a lexer with some hybrid parser features that
// accomplishes this task.
// - It has a lookahead character and never retracts.
// - It has a notion of context (notably single-character mode).
// This machinery, however, can only be edited by hand.
//
// The good side of this is that it's much more efficient because we don't
// break sections without commands, so there are less allocations and less
// tokens to parse.
//---
/* Character source (input formula) */
2019-06-12 18:53:50 +02:00
static char const *lex;
/* Accumulator. This buffer contains text that will be emitted in the next
token, more precisely everything between the start of the token and [lex]
(the [lex] pointer will move forward during lexing), but with escape
characters decoded.
Note that this buffer is static *and* unique, so every token will make its
way through the accumulator. As a consequence, we must apply a parsing rule
that copies the accumulated string every time we flush the buffer. If a
parsing rule has several parameters with data in the accumulator, then all
except the last will have their data overridden before the semantic rule is
executed. */
static char acc_buffer[TEX_LEXER_BUFSIZE];
/* Position of the next free character in [acc_buffer] */
static char *acc;
/* enum state - Lexer automaton states.
The automaton state represents what has been discovered at the previous
step; the state is changed to store instructions for the next character
input round. Often, when a character is read, previously-lexed input is
emitted and the character is stored in the accumulator. Possibly the state
is changed. */
static enum {
/* Reached end-of-file (continuously emits $end tokens) */
eof = 0,
/* When reading text sections to be displayed on the screen, possibly
with escapes - this is the part we don't want to cut. */
text,
/* When reading a command name after a '\' */
command,
/* The following states are transitioned into when their characters are
read from input. The associated token will only be emitted in the
next character input round. */
superscript = '^',
subscript = '_',
lbrace = '{',
rbrace = '}',
2019-06-18 03:27:04 +02:00
break_symbol = '\\',
separator = '&',
} state = text;
/* Single-character mode. When a command name, '^' or '_' is not followed by a
'{', the argument is taken to be the next input character. This mode alters
the behavior of the [text] state (mainly) to emit a token at the next
character instead of storing data in the accumulator. */
int single;
/* Lookahead symbol. The combination of the lookahead character with the
delayed-emission described in [enum state] gives this lexer two characters
to make decisions. For example, when lexing "ab^2", at the third character
input round:
* '^' is read from the input
* "ab" is released as a TEXT token and the accumulator is emptied
* '2' is seen as lookahead and single-character mode is activated */
static int la;
2019-06-12 18:53:50 +02:00
/* forward(): Maybe return
Returns the value of @expr if it's nonnegative; does nothing otherwise. */
#define forward(expr) do { \
int v = expr; \
if(v >= 0) return v; \
} while(0)
2019-06-12 18:53:50 +02:00
/* lexer_init(): TODO: Document lexer_init() */
void lexer_init(char const *formula)
{
acc = acc_buffer;
/* When the formula is empty, don't let the lexer read a lookahead! */
if(!formula || !formula[0])
{
lex = NULL;
state = eof;
la = 0;
single = 0;
return;
}
lex = formula + 1;
state = text;
la = formula[0];
single = 0;
}
2019-06-12 18:53:50 +02:00
/* release(): Release the lexer buffer as a textual token
This function returns produces a text token of the requested types using the
contents of the lexer buffer, but only if the buffer is not empty. Thus the
call must be wrapped into forward() and not return if there is a fallback
action.
@token Requested token type
Returns a nonnegative token number if the buffer is not empty, or -1. */
static int release(int token)
{
if(acc == acc_buffer) return -1;
*acc++ = 0;
acc = acc_buffer;
/* After all we don't need to switch on token */
/* WARN: may be fragile, look for appropriate flags */
yylval.TEXT = yylval.COMMAND = yylval.COMMAND_ABS = acc_buffer;
return token;
}
2019-06-12 18:53:50 +02:00
/* accumulate(): Accumulate characters in the lexer buffer
Adds a new character @c to the lexer buffer. If the buffer is full or if
single-character mode is active, emits a token and empties the buffer. For
this return to work, the call to accumulate() must be wrapped in forward().
@c Character to add (1 byte)
Returns a nonnegative token number if one is emitted, -1 otherwise. */
static int accumulate(int c)
{
*acc++ = c;
/* Send a token if the buffer is full or single-character mode is on */
if(acc >= acc_buffer + TEX_LEXER_BUFSIZE - 1 || single)
{
single = 0;
switch(state)
{
case text: return release(TEXT);
case command: return release(COMMAND);
default: break;
}
}
/* Continue lexing for now */
return -1;
}
2019-06-12 18:53:50 +02:00
/* acccmp(): String comparison with the accumulator
Like strcmp(), but uses the non-NUL-terminated accumulator as one input. */
2019-06-12 18:53:50 +02:00
static int acccmp(char const *str)
{
return strncmp(acc_buffer, str, acc - acc_buffer);
}
2019-06-12 18:53:50 +02:00
/* lexer_text(): Execute a step of lexing from the text state
In text state, we are accumulating characters as long as possible without
releasing tokens. Longer chunks of text render faster on monochrome
calculators and need less memory.
This mode is exited whenever a metacharacter, that is '{', '}', '^' or '_'.
The backslash character '\' is treated with more care because escaped
metacharacters such as '\{' still count as text and don't need us to release
the accumulator.
Returns a token ID if one is emitted, -1 otherwise. */
static int lexer_text(void)
{
/* Feed lexer: this is safe thanks because I heed for c = EOF */
int c = la;
if(!c) { state = eof; return release(TEXT); }
la = *lex++;
/* Escapes and command names */
if(c == '\\')
{
/* Command name: release current buffer and move */
if(la >= 'a' && la <= 'z')
{
state = command;
return release(TEXT);
}
2019-06-18 03:27:04 +02:00
/* Breaks */
if(la == '\\')
{
la = *lex++;
state = '\\';
return release(TEXT);
}
/* Escaped character: accumulate lookahead and feed lexer.
Feeding is safe because current lookahead is not EOF */
2019-06-18 03:27:04 +02:00
if(strchr("{}^_&", la))
{
c = la;
la = *lex++;
/* Intentional fall-through */
}
/* TODO: Emit a warning in an "else" clause? */
return accumulate(c);
}
/* Opening and closing braces are always syntactic elements */
2019-06-18 03:27:04 +02:00
else if(c == '{' || c == '}' || c == '&')
{
state = c;
return release(TEXT);
}
/* Superscript and subscript: heed for various input modes */
else if(c == '^' || c == '_')
{
/* In all cases, prepare to emit c at next lexing round */
state = c;
/* If the next character is not '{', then we don't have a {}
for the argument; enable single-character mode */
if(la != '{') single = 1;
/* Then emit what was already in the buffer, as text */
return release(TEXT);
}
/* Accumulate the current character in the buffer until it's full */
return accumulate(c);
}
2019-06-12 18:53:50 +02:00
/* lexer_command(): Execute a step of lexing from the command state
This state is transitioned into when a '\' followed by a letter is
encountered. The lexer remains there until the end of the command, signaled
by a non-letter character, is reached.
At this point, the lexer can either enter the text mode again and wait for a
'{' to start arguments, or enter the text mode with the single-character
argument flag so any character that comes next is treated as the argument.
The original TeX does this conditionally, using single-character arguments
if and only if the argument does not start with a '{'. However, in this
program, because the number of arguments is not known at parse time, this
would make dumb commands such as '\alpha' gobble an unbounded amount of
arguments. So all commands use brace-only arguments, except for a designated
set. Typically this includes \left and \right which are mainly used without
braces in practice.
Returns a token ID if one is emitted, -1 otherwise. */
static int lexer_command(void)
{
/* Feed lexer; this is safe because I heed for la = EOF */
int c = la;
if(!c) { state = eof; return release(COMMAND); }
la = *lex++;
/* In this state, c is always in the range a .. z */
int ret = accumulate(c);
/* Continue if next character is a command continuation */
if(la >= 'a' && la <= 'z') return ret;
/* Otherwise, release command name */
state = text;
/* Absorbing commands include "left" and "right" */
if(!acccmp("left") || !acccmp("right"))
{
single = 1;
return release(COMMAND_ABS);
}
2019-06-18 03:27:04 +02:00
/* Special environment commands */
if(!acccmp("begin"))
return release(ENV_BEGIN);
if(!acccmp("end"))
return release(ENV_END);
return release(COMMAND);
}
2019-06-12 18:53:50 +02:00
/* yylex(): The lexer
Returns the token type of the next token in the string initialized by the
last call to parse_start(). */
int yylex(void)
{
while(1)
{
if(state == text) forward(lexer_text());
else if(state == command) forward(lexer_command());
else break;
}
/* End-of-File: give up */
if(state == eof) return 0;
/* Character-specific states: feed and return state number */
int c = state;
state = text;
return c;
}
//---
// Parser interface
//---
2019-06-12 18:53:50 +02:00
/* parse(): Parse into a TeX environment */
struct TeX_Env *parse(char const *str)
{
2019-06-12 18:53:50 +02:00
/* Initialize lexer to run on a string */
lexer_init(str);
2019-06-12 18:53:50 +02:00
/* Push a primary environment */
env_push("primary");
int x = yyparse();
2019-06-12 18:53:50 +02:00
/* Pop the primary environment out of stack */
return x ? NULL: env_pop();
}
2019-06-12 18:53:50 +02:00
//---
// Printing and errors
//---
2019-06-12 18:53:50 +02:00
#ifndef TEX_PRINT
2019-06-19 17:52:09 +02:00
void yyerror(TEX_UNUSED char const *error)
{
}
2019-06-12 18:53:50 +02:00
#else
#include <stdio.h>
#include <TeX/print.h>
2019-06-12 18:53:50 +02:00
void yyerror(char const *error)
{
2019-06-12 18:53:50 +02:00
fprintf(stderr, "Parsing failed: %s\n", error);
}
2019-06-12 18:53:50 +02:00
/* TeX_print_lex(): Print the result of lexing */
void TeX_print_lex(char const *formula)
{
lexer_init(formula);
int token;
do
{
token = yylex();
printf("%-3d ", token);
2019-06-12 18:53:50 +02:00
if(token > 0x20 && token < 0x7f)
printf("%c", token);
if(token == TEXT)
printf("TEXT %s", yylval.TEXT);
if(token == COMMAND)
printf("COMMAND %s", yylval.COMMAND);
if(token == COMMAND_ABS)
printf("COMMAND_ABS %s", yylval.COMMAND_ABS);
if(token == ENV_BEGIN)
printf("ENV_BEGIN");
if(token == ENV_END)
printf("ENV_END");
2019-06-12 18:53:50 +02:00
printf("\n");
}
2019-06-12 18:53:50 +02:00
while(token != 0);
}
2019-06-12 18:53:50 +02:00
#endif /* TEX_PRINT */