TeX/src/TeX.y

590 lines
16 KiB
Plaintext

/* TODO Notes for the calculator version.
Don't use GLR (= non-deterministic forks)
Provide <alloca.h>, <malloc.h>, <stddef.h>, <stdlib.h>
To access semantic value of an object, use "-> value"
To suppress locations on-calc: #define YYLLOC_DEFAULT(Cur, Rhs, N)
(or just don't pass --locations) */
%{
#include <TeX/TeX.h>
#include <TeX/parser.h>
#include <TeX/structure.h>
#include <stdlib.h>
#include <string.h>
/* yylex() - The lexer, as usual. Defined in the epilogue of this file. */
int yylex(void);
/* yyerror() - The error function. Called by the node constructors below to
   report failures, in addition to its usual role as the parser's error hook. */
void yyerror(const char *);
/* Result of the last parse: assigned by the semantic action of the [main]
   rule and handed to the caller by parse(). */
static struct TeX_Flow *result = NULL;
//---
// Node allocation functions
//---
/* mkflow_cons() - Append a node at the right edge of a flow
   @flow  Flow to extend, or NULL to start a new flow
   @node  Node to append; a NULL node leaves [flow] untouched
   Returns the resulting flow, which may be a different pointer than [flow]
   when [flow] is NULL. Returns NULL if a new flow had to be allocated and
   the allocation failed (the error is reported through yyerror()). */
static struct TeX_Flow *mkflow_cons(struct TeX_Flow*flow, struct TeX_Node*node)
{
    /* Nothing to append */
    if(node == NULL) return flow;

    /* Existing flow: link the node after the current last element */
    if(flow != NULL)
    {
        flow->last->next = node;
        flow->last = node;
        return flow;
    }

    /* No flow yet: create one whose only element is [node] */
    struct TeX_Flow *fresh = calloc(1, sizeof *fresh);
    if(fresh == NULL)
    {
        yyerror("[[out of memory]]");
        return NULL;
    }
    fresh->first = node;
    fresh->last = node;
    return fresh;
}
/* mknode_text() - Create a new text container node
   @str  NUL-terminated string to store in the node
   Returns a new node of class TEX_NODECLASS_TEXT holding a private copy of
   [str], so the original string may disappear afterwards. Returns NULL (after
   reporting through yyerror()) if either allocation fails. */
static struct TeX_Node *mknode_text(const char *str)
{
    struct TeX_Node *node = calloc(1, sizeof *node);
    if(node == NULL)
    {
        yyerror("[[out of memory]]");
        return NULL;
    }

    /* TODO: Don't use strdup(); use a format conversion instead */
    size_t size = strlen(str) + 1;
    void *copy = malloc(size);
    if(copy == NULL)
    {
        yyerror("[[out of memory]]");
        free(node);
        return NULL;
    }
    /* [size] includes the terminator, so the copy is NUL-terminated */
    memcpy(copy, str, size);

    node->text = copy;
    node->type = TEX_NODECLASS_TEXT;
    return node;
}
/* mknode_command() - Create a new command node
   @name  Command name
   Looks up [name] with class_find(). If the lookup yields a negative id, or
   if the node cannot be allocated, the problem is reported through yyerror()
   and NULL is returned. Otherwise returns a fresh, zero-initialized node of
   that class, with no arguments attached yet. */
static struct TeX_Node *mknode_command(const char *name)
{
    /* Resolve the class id first so that nothing is allocated for unknown
       commands */
    int class_id = class_find(name);
    if(class_id < 0)
    {
        yyerror("[[ unknown command ]]");
        return NULL;
    }

    struct TeX_Node *cmd = calloc(1, sizeof *cmd);
    if(cmd == NULL)
    {
        yyerror("[[out of memory]]");
        return NULL;
    }

    cmd->type = class_id;
    cmd->next = NULL;
    return cmd;
}
/* mknode_arg() - Add an argument to a command node
   @node  Node to add an argument to (may be NULL if its creation failed)
   @arg   Argument flow (may be NULL); ownership is transferred to this
          function
   Always returns [node]. If [node] is a command node with at least one free
   argument slot, [arg] is stored in the first free slot. In every other case
   ([node] is NULL, is a plain text node, or has no free slot), [arg] is freed
   with TeX_free() so that it does not leak, and [node] is returned
   unchanged. */
static struct TeX_Node *mknode_arg(struct TeX_Node *node, struct TeX_Flow *arg)
{
    if(!arg) return node;

    /* The node could not be created (unknown command or out of memory):
       nobody else holds a reference to the argument, so release it here */
    if(!node)
    {
        TeX_free(arg);
        return NULL;
    }

    /* Drop the argument if it's for a plain text node */
    if(node->type == TEX_NODECLASS_TEXT)
    {
        yyerror("[[dropping argument of plain text node]]");
        TeX_free(arg);
        return node;
    }

    /* Otherwise look up a free slot in the argument array of node */
    int i = 0;
    while(i < TEX_MAX_CHILDREN && node->args[i]) i++;

    if(i >= TEX_MAX_CHILDREN)
    {
        yyerror("[[too much arguments for node]]");
        TeX_free(arg);
    }
    else node->args[i] = arg;

    return node;
}
/* mknode_absarg() - Add an absorbed argument to a command node
   @node  Command node (may be NULL if its creation failed)
   @text  Text argument
   For the "left" and "right" classes, the first character of [text] is
   stored as the node's subtype and no argument flow is created. For any
   other node, [text] is wrapped in a text node inside a new flow and attached
   with mknode_arg(), as for a regular argument.
   Returns [node]; in particular returns NULL without touching [text] when
   [node] is NULL. */
static struct TeX_Node *mknode_absarg(struct TeX_Node *node, char const *text)
{
    /* mknode_command() returns NULL for unknown commands and on allocation
       failure; there is nothing to attach the argument to in that case */
    if(!node || !text) return node;

    /* Class name of the node; plain text nodes have no entry in TeX_table,
       which is indexed by (type - 1) */
    char const *class = "";
    if(node->type != TEX_NODECLASS_TEXT)
        class = TeX_table[node->type - 1].name;

    if(!strcmp(class, "left") || !strcmp(class, "right"))
    {
        node->subtype = text[0];
        return node;
    }

    return mknode_arg(node, mkflow_cons(NULL, mknode_text(text)));
}
/* mknode_f() - Make a function node with a flow argument
   @name  Command name; intended for built-in nodes only
   @flow  Flow to attach as argument
   Creates the node for command [name] with mknode_command() and passes it,
   together with [flow], to mknode_arg(). Returns whatever mknode_arg()
   returns, which is the command node, or NULL if the node could not be
   created. */
static struct TeX_Node *mknode_f(const char *name, struct TeX_Flow *flow)
{
    return mknode_arg(mknode_command(name), flow);
}
/* mknode_t() - Make a function node with a text argument
   @cmd  Command name, as for mknode_f()
   @str  String wrapped into a text node, which becomes the only element of
         a new flow passed as argument to mknode_f() */
#define mknode_t(cmd, str) \
    mknode_f((cmd), mkflow_cons(NULL, mknode_text((str))))
%}
/* Semantic values live in a union with one member per typed symbol; the
   lexer fills it through yylval.TEXT, yylval.COMMAND and yylval.COMMAND_ABS */
%define api.value.type union
/* String-carrying tokens produced by the lexer: plain text, command names,
   and names of "absorbing" commands (the lexer emits COMMAND_ABS for "left"
   and "right") */
%token <const char *> TEXT
%token <const char *> COMMAND
%token <const char *> COMMAND_ABS
/* Superscript and subscript markers, returned by the lexer as their own
   character codes */
%left '^' '_'
/* Non-terminals: a flow is a sequence of nodes; node_abs is a node built
   from an absorbing command */
%type <struct TeX_Flow *> flow
%type <struct TeX_Node *> node
%type <struct TeX_Node *> node_abs
%%
/* Start symbol: a formula is a single flow, stored in [result] for parse() */
main:
flow { result = $1; }
/* A flow is a possibly empty sequence of nodes. The empty flow is NULL;
   mkflow_cons() allocates the flow when the first node is added. */
flow:
%empty { $$ = NULL; }
| flow node { $$ = mkflow_cons($1, $2); }
/* A node is a chunk of text, a command, or a node followed by a braced
   argument. The actions may yield NULL (unknown command, out of memory);
   mkflow_cons() ignores NULL nodes. */
node:
TEXT { $$ = mknode_text($1); }
| COMMAND { $$ = mknode_command($1); }
| node_abs { $$ = $1; }
| node '{' flow '}' { $$ = mknode_arg($1, $3); }
/* Special shortcuts for the superscript and subscript classes */
| '^' TEXT { $$ = mknode_t("\\sup", $2); }
| '_' TEXT { $$ = mknode_t("\\sub", $2); }
| '^' '{' flow '}' { $$ = mknode_f("\\sup", $3); }
| '_' '{' flow '}' { $$ = mknode_f("\\sub", $3); }
/* TODO: shift/reduce for [COMMAND_ABS TEXT] - could split in two nodes */
/* An absorbing command followed by any number of TEXT tokens, each handed
   to mknode_absarg() */
node_abs:
COMMAND_ABS { $$ = mknode_command($1); }
| node_abs TEXT { $$ = mknode_absarg($1, $2); }
%%
//---
// The lexer
//
// The lexical analysis is actually the subtle part in this program,
// because we want to preserve text sections without cutting them to avoid
// merging string tokens later on.
//
// The program below is a lexer with some hybrid parser features that
// accomplishes this task.
// - It has a lookahead character and never retracts.
// - It has a notion of context (notably single-character mode).
// This machinery, however, can only be edited by hand.
//
// The upside is that this is much more efficient: sections without commands
// are not broken up, so there are fewer allocations and fewer tokens to
// parse.
//---
/* Character source (input formula). Points to the next character to be read
   into the lookahead [la]; lexer_init() sets it to NULL for an empty
   formula. */
static const char *lex;
/* Accumulator. This buffer contains text that will be emitted in the next
token, more precisely everything between the start of the token and [lex]
(the [lex] pointer will move forward during lexing), but with escape
characters decoded.
Note that this buffer is static *and* unique, so every token will make its
way through the accumulator. As a consequence, semantic actions must copy the
accumulated string as soon as they receive it, because the next token will
reuse the same storage. If a parsing rule has several parameters with data in
the accumulator, then all except the last will have their data overwritten
before the semantic action is executed. */
static char acc_buffer[TEX_LEXER_BUFSIZE];
/* Position of the next free character in [acc_buffer]. The accumulator is
   empty when this is equal to [acc_buffer]. */
static char *acc;
/* enum state - Lexer automaton states.
The automaton state represents what has been discovered at the previous
step; the state is changed to store instructions for the next character
input round. Often, when a character is read, previously-lexed input is
emitted and the character is stored in the accumulator. Possibly the state
is changed.
The character-specific states have the value of their character, so that
yylex() can return the state itself as the token number. */
static enum {
/* Reached end-of-file (continuously emits $end tokens) */
eof = 0,
/* When reading text sections to be displayed on the screen, possibly
with escapes - this is the part we don't want to cut. */
text,
/* When reading a command name after a '\' */
command,
/* The following states are transitioned into when their characters are
read from input. The associated token will only be emitted in the
next character input round. */
superscript = '^',
subscript = '_',
lbrace = '{',
rbrace = '}',
} state = text;
/* Single-character mode. When an absorbing command name, '^' or '_' is not
followed by a '{', the argument is taken to be the next input character. This
mode alters the behavior of the [text] state (mainly) to emit a token at the
next character instead of storing data in the accumulator. The flag is
cleared by accumulate() when it takes effect.
NOTE(review): unlike the other lexer variables this one is not static, so it
has external linkage - confirm whether another file relies on that. */
int single;
/* Lookahead symbol. The combination of the lookahead character with the
delayed-emission described in [enum state] gives this lexer two characters
to make decisions. For example, when lexing "ab^2", at the third character
input round:
* '^' is read from the input
* "ab" is released as a TEXT token and the accumulator is emptied
* '2' is seen as lookahead and single-character mode is activated
A value of 0 means that the end of the formula has been reached. */
static int la;
/* forward() - Maybe return
   Evaluates @expr exactly once. If the value is nonnegative, returns it from
   the *calling* function; otherwise does nothing and execution continues
   after the macro.
   The argument is parenthesized and the temporary has a name unlikely to
   clash with the caller's variables, so that an expression mentioning a
   variable called "v" (as in forward(v)) is not captured by the macro. */
#define forward(expr) do { \
    int forward_value_ = (expr); \
    if(forward_value_ >= 0) return forward_value_; \
} while(0)
/* lexer_init() - Reset the lexer to read a new formula
   @formula  NUL-terminated input string; may be NULL or empty
   Empties the accumulator and clears single-character mode. For a non-empty
   formula, the first character is loaded into the lookahead, [lex] points to
   the second character and the automaton starts in the [text] state. For a
   NULL or empty formula, the automaton starts directly in the [eof] state
   with no character source, so that no lookahead is ever read. */
void lexer_init(const char *formula)
{
    /* When the formula is empty, don't let the lexer read a lookahead! */
    int empty = (formula == NULL || formula[0] == '\0');

    acc = acc_buffer;
    single = 0;

    state = empty ? eof : text;
    la = empty ? 0 : formula[0];
    lex = empty ? NULL : formula + 1;
}
/* release() - Release the lexer buffer as a textual token
   Produces a token of the requested type whose semantic value is the
   contents of the lexer buffer, but only if the buffer is not empty. The
   buffer is NUL-terminated and reset so that the next token starts at its
   beginning; the semantic value points into the buffer itself.
   Because nothing is emitted for an empty buffer, callers that have a
   fallback action must wrap the call in forward() instead of returning its
   result directly.
   @token  Requested token type
   Returns [token] if the buffer was not empty, -1 otherwise. */
static int release(int token)
{
    /* Empty buffer: nothing to emit */
    if(acc == acc_buffer) return -1;

    /* Terminate the string and rewind the accumulator */
    *acc = '\0';
    acc = acc_buffer;

    /* After all we don't need to switch on token */
    /* WARN: may be fragile, look for appropriate flags */
    yylval.COMMAND_ABS = acc_buffer;
    yylval.COMMAND = acc_buffer;
    yylval.TEXT = acc_buffer;

    return token;
}
/* accumulate() - Accumulate characters in the lexer buffer
   Appends character @c to the lexer buffer. If the buffer is then full (one
   byte is kept for the terminator) or if single-character mode is active,
   the mode is switched off and the buffer is released as a TEXT or COMMAND
   token according to the current state. In any other state nothing is
   emitted. For the emitted token to reach the parser, the call must be
   wrapped in forward() or its result returned.
   @c  Character to add (1 byte)
   Returns a nonnegative token number if one is emitted, -1 otherwise. */
static int accumulate(int c)
{
    *acc++ = c;

    /* Keep lexing unless the buffer is full or single-character mode is on */
    int full = (acc >= acc_buffer + TEX_LEXER_BUFSIZE - 1);
    if(!full && !single) return -1;

    single = 0;

    if(state == text) return release(TEXT);
    if(state == command) return release(COMMAND);

    /* Other states do not emit textual tokens */
    return -1;
}
/* acccmp() - String comparison with the accumulator
   Like strcmp(), but uses the non-NUL-terminated accumulator as one input.
   @str  NUL-terminated string to compare the accumulator with
   Returns 0 only if the accumulator holds exactly [str]: a bare strncmp()
   limited to the accumulator length would also accept any prefix of [str]
   (for instance "le" or an empty accumulator would match "left"). Otherwise
   returns a negative or positive value with the same meaning as strcmp(). */
static int acccmp(const char *str)
{
    size_t len = (size_t)(acc - acc_buffer);

    int cmp = strncmp(acc_buffer, str, len);
    if(cmp) return cmp;

    /* The first [len] characters match, hence [str] has at least [len]
       characters. If it has more, the accumulator is a strict prefix of
       [str] and thus compares lower. */
    return str[len] ? -1 : 0;
}
/* lexer_text() - Execute a step of lexing from the text state
   In text state, we are accumulating characters as long as possible without
   releasing tokens. Longer chunks of text render faster on monochrome
   calculators and need less memory.
   This mode is exited whenever a metacharacter, that is '{', '}', '^' or '_',
   is read. The backslash character '\' is treated with more care because
   escaped metacharacters such as '\{' still count as text and don't need us
   to release the accumulator.
   A backslash at the very end of the input is kept as a literal character;
   the lexer never reads past the terminating NUL.
   Returns a token ID if one is emitted, -1 otherwise. */
static int lexer_text(void)
{
    /* Feed lexer: this is safe because the end of input (c = 0) is checked
       before [lex] is advanced */
    int c = la;
    if(!c)
    {
        state = eof;
        return release(TEXT);
    }
    la = *lex++;

    /* Escapes and command names */
    if(c == '\\')
    {
        /* Command name: release current buffer and move */
        if(la >= 'a' && la <= 'z')
        {
            state = command;
            return release(TEXT);
        }

        /* Escaped character: accumulate lookahead and feed lexer.
           The lookahead must be checked against 0 explicitly because
           strchr() also matches the terminator of its first argument;
           without the check, a trailing backslash would consume the
           end of input and read beyond the formula. */
        if(la && strchr("\\{}^_", la))
        {
            c = la;
            la = *lex++;
        }
        /* TODO: Emit a warning in an "else" clause? */

        return accumulate(c);
    }

    /* Opening and closing braces are always syntactic elements */
    else if(c == '{' || c == '}')
    {
        state = c;
        return release(TEXT);
    }

    /* Superscript and subscript: heed for various input modes */
    else if(c == '^' || c == '_')
    {
        /* In all cases, prepare to emit c at next lexing round */
        state = c;
        /* If the next character is not '{', then we don't have a {}
           for the argument; enable single-character mode */
        if(la != '{') single = 1;
        /* Then emit what was already in the buffer, as text */
        return release(TEXT);
    }

    /* Accumulate the current character in the buffer until it's full */
    return accumulate(c);
}
/* lexer_command() - Execute a step of lexing from the command state
   This state is transitioned into when a '\' followed by a letter is
   encountered. The lexer remains there until the end of the command, signaled
   by a non-letter character, is reached.
   At this point, the lexer can either enter the text mode again and wait for
   a '{' to start arguments, or enter the text mode with the single-character
   argument flag so any character that comes next is treated as the argument.
   The original TeX does this conditionally, using single-character arguments
   if and only if the argument does not start with a '{'. However, in this
   program, because the number of arguments is not known at parse time, this
   would make dumb commands such as '\alpha' gobble an unbounded amount of
   arguments. So all commands use brace-only arguments, except for a
   designated set. Typically this includes \left and \right which are mainly
   used without braces in practice.
   Returns a token ID if one is emitted, -1 otherwise. */
static int lexer_command(void)
{
    /* Feed lexer; this is safe because the end of input (c = 0) is checked
       before [lex] is advanced */
    int c = la;
    if(!c)
    {
        state = eof;
        return release(COMMAND);
    }
    la = *lex++;

    /* In this state, c is always in the range a .. z */
    int ret = accumulate(c);

    /* Continue if next character is a command continuation */
    if(la >= 'a' && la <= 'z') return ret;

    /* Otherwise the command name ends here */
    state = text;

    /* If accumulate() already emitted a token (single-character mode or
       full buffer), the accumulator is now empty: forward that token
       instead of losing it and classifying an empty name */
    if(ret >= 0) return ret;

    /* Absorbing commands include "left" and "right" */
    if(!acccmp("left") || !acccmp("right"))
    {
        single = 1;
        return release(COMMAND_ABS);
    }

    return release(COMMAND);
}
/* yylex() - The lexer
   Runs the text and command automaton steps until one of them emits a token
   or the automaton moves to another state. In the [eof] state, returns 0. In
   the character-specific states ('^', '_', '{', '}'), returns the state
   value itself as the token and goes back to the [text] state.
   Returns the token type of the next token in the string set up by the last
   call to lexer_init() (which parse_start() calls). */
int yylex(void)
{
    /* Step the automaton while it is in a state that accumulates text */
    while(state == text || state == command)
    {
        int token = (state == text) ? lexer_text() : lexer_command();
        if(token >= 0) return token;
    }

    /* End-of-File: give up */
    if(state == eof) return 0;

    /* Character-specific states: the state number is the token */
    int pending = state;
    state = text;
    return pending;
}
//---
// Parser interface
//---
/* parse_start(): Configure parser to run on a string
   @str  Formula to parse; forwarded unchanged to lexer_init(), which
         accepts NULL and empty strings */
void parse_start(const char *str)
{
    lexer_init(str);
}
/* parse(): Parse into a TeX flow
   Runs yyparse() on the input configured by parse_start(). Returns the flow
   stored by the grammar's [main] rule when yyparse() reports success (0),
   and NULL when it reports any failure. */
struct TeX_Flow *parse(void)
{
    if(yyparse() != 0) return NULL;
    return result;
}
#ifdef TEX_DEBUG
#include <stdio.h>
#endif /* TEX_DEBUG */
/* yyerror(): Report a parsing error
   @error  Message describing the problem
   When TEX_DEBUG is defined, the message is printed on stderr; otherwise the
   error is silently ignored. */
void yyerror(const char *error)
{
#ifdef TEX_DEBUG
    fprintf(stderr, "Parsing failed: %s\n", error);
#else /* TEX_DEBUG */
    (void)error;
#endif /* TEX_DEBUG */
}
//---
// Debugging functions
//---
#ifdef TEX_DEBUG
/* Terminal escape sequences used to dim the geometry information in debug
   output. The escape character is written as \033 because \e is a compiler
   extension and not part of ISO C; the resulting bytes are the same. */
#define GRAY "\033[30;1m"
#define END "\033[0m"
/* TeX_debug_lex(): Display the result of lexing on stdout */
void TeX_debug_lex(const char *formula)
{
lexer_init(formula);
int token;
do
{
token = yylex();
printf("%-3d ", token);
if(strchr("{}^_", token)) printf("%c", token);
if(token == 258) printf("TEXT %s", yylval.TEXT);
if(token == 259) printf("COMMAND %s", yylval.COMMAND);
if(token == 260) printf("COMMAND_ABS %s", yylval.COMMAND_ABS);
printf("\n");
}
while(token != 0);
}
/* TeX_debug_node(): Recursively display the structure of a node
   @node    Node to display (NULL is reported as such)
   @indent  Number of spaces printed before the node
   Text nodes are printed as a quoted string; command nodes are printed as
   their class name and subtype (if any), followed by their arguments
   indented by 4 more columns. Both are followed by the node's geometry
   fields. */
void TeX_debug_node(struct TeX_Node *node, int indent)
{
    printf("%*s", indent, "");

    if(!node)
    {
        puts("node (null)");
        return;
    }

    /* Plain text: string and geometry, no children */
    if(node->type == TEX_NODECLASS_TEXT)
    {
        printf("\"%s\"", node->text);
        printf(GRAY " %dx%d,%d %+d%+d" END "\n", node->width,
            node->height, node->line, node->x, node->l);
        return;
    }

    /* Command: class name, optional subtype, geometry */
    printf("<%s>", TeX_table[node->type - 1].name);
    if(node->subtype)
        printf(" subtype '%c'", node->subtype);
    printf(" "GRAY "%dx%d,%d %+d%+d" END "\n", node->width,
        node->height, node->line, node->x, node->l);

    /* Arguments are stored contiguously; stop at the first empty slot */
    int i = 0;
    while(i < TEX_MAX_CHILDREN && node->args[i])
    {
        TeX_debug_flow(node->args[i], indent + 4);
        i++;
    }
}
/* TeX_debug_flow(): Recursively display the structure of a flow */
void TeX_debug_flow(struct TeX_Flow *flow, int indent)
{
printf("%*s", indent, "");
if(!flow) { puts("flow (null)"); return; }
printf("flow " GRAY "%dx%d,%d" END "\n", flow->width, flow->height,
flow->line);
struct TeX_Node *node = flow->first;
while(node)
{
TeX_debug_node(node, indent + 4);
node = node->next;
}
}
#endif /* TEX_DEBUG */