TeX/src/TeX.y

%{

#include <TeX/config.h>
#include <TeX/node.h>
#include <TeX/flow.h>
#include <TeX/env.h>

#include <stdlib.h>
#include <string.h>

/* yylex(): The lexer, as usual */
int yylex(void);
/* yyerror(): The error function */
void yyerror(char const *message);

//---
//	Environment stack management
//---

/* Environment stack and current environment's index */
static struct TeX_Env *env_stack[TEX_ENV_DEPTH] = { NULL };
static int env_index = 0;

/* Current environment and its type */
#define env (env_stack[env_index])

void env_push(char const *type)
{
	struct TeX_Env *new_env = NULL;

	if(!strcmp(type, "primary"))
		new_env = TeX_env_primary();
	if(!strcmp(type, "matrix"))
		new_env = TeX_env_matrix();

	/* If new_env is NULL... still insert it. This is more likely to cause
	   an error, and avoids asymmetry with pops */

	env_index++;
	env = new_env;
}

struct TeX_Env *env_pop(void)
{
	struct TeX_Env *ret = env;
	env_index--;
	return ret;
}

//---
//	Extra node allocation functions
//---

/* mknode_f(): Make a function node with a flow argument
   @name  Command name; intended for built-in nodes only
   @flow  Flow argument
   Returns a node for command [name] with as argument [flow]. */
struct TeX_Node *mknode_f(char const *name, struct TeX_Flow *flow)
{
	struct TeX_Node *node = TeX_node_command(name);
	return TeX_node_add_arg(node, flow);
}

/* mknode_t(): Make a function node with a text argument */
struct TeX_Node *mknode_t(char const *name, char const *text)
{
	struct TeX_Flow *flow = TeX_flow_add_node(NULL, TeX_node_text(text));
	return mknode_f(name, flow);
}

%}

%define api.value.type union

/* Basic text and commands */
%token <char const *> TEXT
%token <char const *> COMMAND
%token <char const *> COMMAND_ABS

/* Special tokens for environments */
%token ENV_BEGIN
%token ENV_END

%left '^' '_'

%type <struct TeX_Flow *> flow
%type <struct TeX_Node *> node
%type <struct TeX_Node *> node_abs
%type <struct TeX_Node *> env_node
%type <struct TeX_Env *> env_end

%%

env:
	%empty			{ }
	| env node		{ env->add_node(env, $2); }
	| env '&'		{ env->add_separator(env); }
	| env '\\'		{ env->add_break(env); }

flow:
	%empty			{ $$ = NULL; }
	| flow node		{ $$ = TeX_flow_add_node($1, $2); }

node:
	TEXT			{ $$ = TeX_node_text($1); }
	| COMMAND		{ $$ = TeX_node_command($1); }
	| node_abs TEXT		{ $$ = TeX_node_absorb($1, $2); }
	| node '{' flow '}'	{ $$ = TeX_node_add_arg($1, $3); }
	/* Special shortcuts for the superscript and subscript classes */
	| '^' TEXT		{ $$ = mknode_t("\\sup", $2); }
	| '_' TEXT		{ $$ = mknode_t("\\sub", $2); }
	| '^' '{' flow '}'	{ $$ = mknode_f("\\sup", $3); }
	| '_' '{' flow '}'	{ $$ = mknode_f("\\sub", $3); }
	/* Environments */
	| env_node		{ $$ = $1; }

node_abs:
	COMMAND_ABS		{ $$ = TeX_node_command($1); }

env_node:
	/* TODO: Add TeX_mknode_env() */
	env_begin env env_end	{ $$ = TeX_node_env($3); }

env_begin:
	ENV_BEGIN '{' TEXT '}'  { env_push($3); }
env_end:
	ENV_END '{' TEXT '}'    { $$ = env_pop(); }

%%

//---
//	The lexer
//
//	The lexical analysis is actually the subtle part in this program,
//	because we want to preserve text sections without cutting them to avoid
//	merging string tokens later on.
//
//	The program below is a lexer with some hybrid parser features that
//	accomplishes this task.
//	- It has a lookahead character and never retracts.
//	- It has a notion of context (notably single-character mode).
//	This machinery, however, can only be edited by hand.
//
//	The good side of this is that it's much more efficient because we don't
//	break sections without commands, so there are less allocations and less
//	tokens to parse.
//---

/* Character source (input formula) */
static char const *lex;

/* Accumulator. This buffer contains text that will be emitted in the next
   token, more precisely everything between the start of the token and [lex]
   (the [lex] pointer will move forward during lexing), but with escape
   characters decoded.

   Note that this buffer is static *and* unique, so every token will make its
   way through the accumulator. As a consequence, we must apply a parsing rule
   that copies the accumulated string every time we flush the buffer. If a
   parsing rule has several parameters with data in the accumulator, then all
   except the last will have their data overridden before the semantic rule is
   executed. */
static char acc_buffer[TEX_LEXER_BUFSIZE];

/* Position of the next free character in [acc_buffer] */
static char *acc;

/* enum state - Lexer automaton states.
   The automaton state represents what has been discovered at the previous
   step; the state is changed to store instructions for the next character
   input round. Often, when a character is read, previously-lexed input is
   emitted and the character is stored in the accumulator. Possibly the state
   is changed. */
static enum {
	/* Reached end-of-file (continuously emits $end tokens) */
	eof = 0,
	/* When reading text sections to be displayed on the screen, possibly
	   with escapes - this is the part we don't want to cut. */
	text,
	/* When reading a command name after a '\' */
	command,
	/* The following states are transitioned into when their characters are
	   read from input. The associated token will only be emitted in the
	   next character input round. */
	superscript	= '^',
	subscript	= '_',
	lbrace		= '{',
	rbrace		= '}',
	break_symbol	= '\\',
	separator	= '&',
} state = text;

/* Single-character mode. When a command name, '^' or '_' is not followed by a
   '{', the argument is taken to be the next input character. This mode alters
   the behavior of the [text] state (mainly) to emit a token at the next
   character instead of storing data in the accumulator. */
int single;
/* Lookahead symbol. The combination of the lookahead character with the
   delayed-emission described in [enum state] gives this lexer two characters
   to make decisions. For example, when lexing "ab^2", at the third character
   input round:
   * '^' is read from the input
   * "ab" is released as a TEXT token and the accumulator is emptied
   * '2' is seen as lookahead and single-character mode is activated */
static int la;

/* forward(): Maybe return
   Returns the value of @expr if it's nonnegative; does nothing otherwise. */
#define forward(expr) do {		\
	int v = expr;			\
	if(v >= 0) return v;		\
} while(0)

/* lexer_init(): TODO: Document lexer_init() */
void lexer_init(char const *formula)
{
	acc = acc_buffer;

	/* When the formula is empty, don't let the lexer read a lookahead! */
	if(!formula || !formula[0])
	{
		lex = NULL;
		state = eof;
		la = 0;
		single = 0;
		return;
	}

	lex = formula + 1;
	state = text;
	la = formula[0];
	single = 0;
}

/* release(): Release the lexer buffer as a textual token
   This function returns produces a text token of the requested types using the
   contents of the lexer buffer, but only if the buffer is not empty. Thus the
   call must be wrapped into forward() and not return if there is a fallback
   action.
   @token  Requested token type
   Returns a nonnegative token number if the buffer is not empty, or -1. */
static int release(int token)
{
	if(acc == acc_buffer) return -1;

	*acc++ = 0;
	acc = acc_buffer;

	/* After all we don't need to switch on token */
	/* WARN: may be fragile, look for appropriate flags */
	yylval.TEXT = yylval.COMMAND = yylval.COMMAND_ABS = acc_buffer;
	return token;
}

/* accumulate(): Accumulate characters in the lexer buffer
   Adds a new character @c to the lexer buffer. If the buffer is full or if
   single-character mode is active, emits a token and empties the buffer. For
   this return to work, the call to accumulate() must be wrapped in forward().
   @c  Character to add (1 byte)
   Returns a nonnegative token number if one is emitted, -1 otherwise. */
static int accumulate(int c)
{
	*acc++ = c;

	/* Send a token if the buffer is full or single-character mode is on */
	if(acc >= acc_buffer + TEX_LEXER_BUFSIZE - 1 || single)
	{
		single = 0;

		switch(state)
		{
		case text:	return release(TEXT);
		case command:	return release(COMMAND);
		default:	break;
		}
	}

	/* Continue lexing for now */
	return -1;
}

/* acccmp(): String comparison with the accumulator
   Like strcmp(), but uses the non-NUL-terminated accumulator as one input. */
static int acccmp(char const *str)
{
	return strncmp(acc_buffer, str, acc - acc_buffer);
}

/* lexer_text(): Execute a step of lexing from the text state

   In text state, we are accumulating characters as long as possible without
   releasing tokens. Longer chunks of text render faster on monochrome
   calculators and need less memory.

   This mode is exited whenever a metacharacter, that is '{', '}', '^' or '_'.
   The backslash character '\' is treated with more care because escaped
   metacharacters such as '\{' still count as text and don't need us to release
   the accumulator.

   Returns a token ID if one is emitted, -1 otherwise. */
static int lexer_text(void)
{
	/* Feed lexer: this is safe thanks because I heed for c = EOF */
	int c = la;
	if(!c) { state = eof; return release(TEXT); }
	la = *lex++;

	/* Escapes and command names */
	if(c == '\\')
	{
		/* Command name: release current buffer and move */
		if(la >= 'a' && la <= 'z')
		{
			state = command;
			return release(TEXT);
		}

		/* Breaks */
		if(la == '\\')
		{
			la = *lex++;
			state = '\\';
			return release(TEXT);
		}

		/* Escaped character: accumulate lookahead and feed lexer.
		   Feeding is safe because current lookahead is not EOF */
		if(strchr("{}^_&", la))
		{
			c = la;
			la = *lex++;

			/* Intentional fall-through */
		}
		/* TODO: Emit a warning in an "else" clause? */

		return accumulate(c);
	}

	/* Opening and closing braces are always syntactic elements */
	else if(c == '{' || c == '}' || c == '&')
	{
		state = c;
		return release(TEXT);
	}

	/* Superscript and subscript: heed for various input modes */
	else if(c == '^' || c == '_')
	{
		/* In all cases, prepare to emit c at next lexing round */
		state = c;

		/* If the next character is not '{', then we don't have a {}
		   for the argument; enable single-character mode */
		if(la != '{') single = 1;

		/* Then emit what was already in the buffer, as text */
		return release(TEXT);
	}

	/* Accumulate the current character in the buffer until it's full */
	return accumulate(c);
}

/* lexer_command(): Execute a step of lexing from the command state

   This state is transitioned into when a '\' followed by a letter is
   encountered. The lexer remains there until the end of the command, signaled
   by a non-letter character, is reached.

   At this point, the lexer can either enter the text mode again and wait for a
   '{' to start arguments, or enter the text mode with the single-character
   argument flag so any character that comes next is treated as the argument.

   The original TeX does this conditionally, using single-character arguments
   if and only if the argument does not start with a '{'. However, in this
   program, because the number of arguments is not known at parse time, this
   would make dumb commands such as '\alpha' gobble an unbounded amount of
   arguments. So all commands use brace-only arguments, except for a designated
   set. Typically this includes \left and \right which are mainly used without
   braces in practice.

   Returns a token ID if one is emitted, -1 otherwise. */
static int lexer_command(void)
{
	/* Feed lexer; this is safe because I heed for la = EOF */
	int c = la;
	if(!c) { state = eof; return release(COMMAND); }
	la = *lex++;

	/* In this state, c is always in the range a .. z */
	int ret = accumulate(c);

	/* Continue if next character is a command continuation */
	if(la >= 'a' && la <= 'z') return ret;

	/* Otherwise, release command name */
	state = text;

	/* Absorbing commands include "left" and "right" */
	if(!acccmp("left") || !acccmp("right"))
	{
		single = 1;
		return release(COMMAND_ABS);
	}

	/* Special environment commands */
	if(!acccmp("begin"))
		return release(ENV_BEGIN);
	if(!acccmp("end"))
		return release(ENV_END);

	return release(COMMAND);
}

/* yylex(): The lexer
   Returns the token type of the next token in the string initialized by the
   last call to parse_start(). */
int yylex(void)
{
	while(1)
	{
		if(state == text) forward(lexer_text());
		else if(state == command) forward(lexer_command());
		else break;
	}

	/* End-of-File: give up */
	if(state == eof) return 0;

	/* Character-specific states: feed and return state number */
	int c = state;
	state = text;
	return c;
}

//---
//	Parser interface
//---

/* parse(): Parse into a TeX environment */
struct TeX_Env *parse(char const *str)
{
	/* Initialize lexer to run on a string */
	lexer_init(str);

	/* Push a primary environment */
	env_push("primary");

	int x = yyparse();

	/* Pop the primary environment out of stack */
	return x ? NULL: env_pop();
}

//---
//	Printing and errors
//---

#ifndef TEX_PRINT

void yyerror(TEX_UNUSED char const *error)
{
}

#else

#include <stdio.h>
#include <TeX/print.h>

void yyerror(char const *error)
{
	fprintf(stderr, "Parsing failed: %s\n", error);
}

/* TeX_print_lex(): Print the result of lexing */
void TeX_print_lex(char const *formula)
{
	lexer_init(formula);
	int token;

	do
	{
		token = yylex();
		printf("%-3d ", token);

		if(token > 0x20 && token < 0x7f)
			printf("%c", token);
		if(token == TEXT)
			printf("TEXT        %s", yylval.TEXT);
		if(token == COMMAND)
			printf("COMMAND     %s", yylval.COMMAND);
		if(token == COMMAND_ABS)
			printf("COMMAND_ABS %s", yylval.COMMAND_ABS);
		if(token == ENV_BEGIN)
			printf("ENV_BEGIN");
		if(token == ENV_END)
			printf("ENV_END");

		printf("\n");
	}
	while(token != 0);
}

#endif /* TEX_PRINT */