%{ #include #include #include #include #include #include /* yylex(): The lexer, as usual */ int yylex(void); /* yyerror(): The error function */ void yyerror(char const *message); //--- // Environment stack management //--- /* Environment stack and current environment's index */ static struct TeX_Env *env_stack[TEX_ENV_DEPTH] = { NULL }; static int env_index = 0; /* Current environment and its type */ #define env (env_stack[env_index]) void env_push(char const *type) { struct TeX_Env *new_env = NULL; if(!strcmp(type, "primary")) new_env = TeX_env_primary(); if(!strcmp(type, "matrix")) new_env = TeX_env_matrix(); /* If new_env is NULL... still insert it. This is more likely to cause an error, and avoids asymmetry with pops */ env_index++; env = new_env; } struct TeX_Env *env_pop(void) { struct TeX_Env *ret = env; env_index--; return ret; } //--- // Extra node allocation functions //--- /* mknode_f(): Make a function node with a flow argument @name Command name; intended for built-in nodes only @flow Flow argument Returns a node for command [name] with as argument [flow]. */ struct TeX_Node *mknode_f(char const *name, struct TeX_Flow *flow) { struct TeX_Node *node = TeX_node_command(name); return TeX_node_add_arg(node, flow); } /* mknode_t(): Make a function node with a text argument */ struct TeX_Node *mknode_t(char const *name, char const *text) { struct TeX_Flow *flow = TeX_flow_add_node(NULL, TeX_node_text(text)); return mknode_f(name, flow); } %} %define api.value.type union /* Basic text and commands */ %token TEXT %token COMMAND %token COMMAND_ABS /* Special tokens for environments */ %token ENV_BEGIN %token ENV_END %left '^' '_' %type flow %type node %type node_abs %type env_node %type env_end %% env: %empty { } | env node { env->add_node(env, $2); } | env '&' { env->add_separator(env); } | env '\\' { env->add_break(env); } flow: %empty { $$ = NULL; } | flow node { $$ = TeX_flow_add_node($1, $2); } node: TEXT { $$ = TeX_node_text($1); } | COMMAND { $$ = TeX_node_command($1); } | node_abs TEXT { $$ = TeX_node_absorb($1, $2); } | node '{' flow '}' { $$ = TeX_node_add_arg($1, $3); } /* Special shortcuts for the superscript and subscript classes */ | '^' TEXT { $$ = mknode_t("\\sup", $2); } | '_' TEXT { $$ = mknode_t("\\sub", $2); } | '^' '{' flow '}' { $$ = mknode_f("\\sup", $3); } | '_' '{' flow '}' { $$ = mknode_f("\\sub", $3); } /* Environments */ | env_node { $$ = $1; } node_abs: COMMAND_ABS { $$ = TeX_node_command($1); } env_node: /* TODO: Add TeX_mknode_env() */ env_begin env env_end { $$ = TeX_node_env($3); } env_begin: ENV_BEGIN '{' TEXT '}' { env_push($3); } env_end: ENV_END '{' TEXT '}' { $$ = env_pop(); } %% //--- // The lexer // // The lexical analysis is actually the subtle part in this program, // because we want to preserve text sections without cutting them to avoid // merging string tokens later on. // // The program below is a lexer with some hybrid parser features that // accomplishes this task. // - It has a lookahead character and never retracts. // - It has a notion of context (notably single-character mode). // This machinery, however, can only be edited by hand. // // The good side of this is that it's much more efficient because we don't // break sections without commands, so there are less allocations and less // tokens to parse. //--- /* Character source (input formula) */ static char const *lex; /* Accumulator. This buffer contains text that will be emitted in the next token, more precisely everything between the start of the token and [lex] (the [lex] pointer will move forward during lexing), but with escape characters decoded. Note that this buffer is static *and* unique, so every token will make its way through the accumulator. As a consequence, we must apply a parsing rule that copies the accumulated string every time we flush the buffer. If a parsing rule has several parameters with data in the accumulator, then all except the last will have their data overridden before the semantic rule is executed. */ static char acc_buffer[TEX_LEXER_BUFSIZE]; /* Position of the next free character in [acc_buffer] */ static char *acc; /* enum state - Lexer automaton states. The automaton state represents what has been discovered at the previous step; the state is changed to store instructions for the next character input round. Often, when a character is read, previously-lexed input is emitted and the character is stored in the accumulator. Possibly the state is changed. */ static enum { /* Reached end-of-file (continuously emits $end tokens) */ eof = 0, /* When reading text sections to be displayed on the screen, possibly with escapes - this is the part we don't want to cut. */ text, /* When reading a command name after a '\' */ command, /* The following states are transitioned into when their characters are read from input. The associated token will only be emitted in the next character input round. */ superscript = '^', subscript = '_', lbrace = '{', rbrace = '}', break_symbol = '\\', separator = '&', } state = text; /* Single-character mode. When a command name, '^' or '_' is not followed by a '{', the argument is taken to be the next input character. This mode alters the behavior of the [text] state (mainly) to emit a token at the next character instead of storing data in the accumulator. */ int single; /* Lookahead symbol. The combination of the lookahead character with the delayed-emission described in [enum state] gives this lexer two characters to make decisions. For example, when lexing "ab^2", at the third character input round: * '^' is read from the input * "ab" is released as a TEXT token and the accumulator is emptied * '2' is seen as lookahead and single-character mode is activated */ static int la; /* forward(): Maybe return Returns the value of @expr if it's nonnegative; does nothing otherwise. */ #define forward(expr) do { \ int v = expr; \ if(v >= 0) return v; \ } while(0) /* lexer_init(): TODO: Document lexer_init() */ void lexer_init(char const *formula) { acc = acc_buffer; /* When the formula is empty, don't let the lexer read a lookahead! */ if(!formula || !formula[0]) { lex = NULL; state = eof; la = 0; single = 0; return; } lex = formula + 1; state = text; la = formula[0]; single = 0; } /* release(): Release the lexer buffer as a textual token This function returns produces a text token of the requested types using the contents of the lexer buffer, but only if the buffer is not empty. Thus the call must be wrapped into forward() and not return if there is a fallback action. @token Requested token type Returns a nonnegative token number if the buffer is not empty, or -1. */ static int release(int token) { if(acc == acc_buffer) return -1; *acc++ = 0; acc = acc_buffer; /* After all we don't need to switch on token */ /* WARN: may be fragile, look for appropriate flags */ yylval.TEXT = yylval.COMMAND = yylval.COMMAND_ABS = acc_buffer; return token; } /* accumulate(): Accumulate characters in the lexer buffer Adds a new character @c to the lexer buffer. If the buffer is full or if single-character mode is active, emits a token and empties the buffer. For this return to work, the call to accumulate() must be wrapped in forward(). @c Character to add (1 byte) Returns a nonnegative token number if one is emitted, -1 otherwise. */ static int accumulate(int c) { *acc++ = c; /* Send a token if the buffer is full or single-character mode is on */ if(acc >= acc_buffer + TEX_LEXER_BUFSIZE - 1 || single) { single = 0; switch(state) { case text: return release(TEXT); case command: return release(COMMAND); default: break; } } /* Continue lexing for now */ return -1; } /* acccmp(): String comparison with the accumulator Like strcmp(), but uses the non-NUL-terminated accumulator as one input. */ static int acccmp(char const *str) { return strncmp(acc_buffer, str, acc - acc_buffer); } /* lexer_text(): Execute a step of lexing from the text state In text state, we are accumulating characters as long as possible without releasing tokens. Longer chunks of text render faster on monochrome calculators and need less memory. This mode is exited whenever a metacharacter, that is '{', '}', '^' or '_'. The backslash character '\' is treated with more care because escaped metacharacters such as '\{' still count as text and don't need us to release the accumulator. Returns a token ID if one is emitted, -1 otherwise. */ static int lexer_text(void) { /* Feed lexer: this is safe thanks because I heed for c = EOF */ int c = la; if(!c) { state = eof; return release(TEXT); } la = *lex++; /* Escapes and command names */ if(c == '\\') { /* Command name: release current buffer and move */ if(la >= 'a' && la <= 'z') { state = command; return release(TEXT); } /* Breaks */ if(la == '\\') { la = *lex++; state = '\\'; return release(TEXT); } /* Escaped character: accumulate lookahead and feed lexer. Feeding is safe because current lookahead is not EOF */ if(strchr("{}^_&", la)) { c = la; la = *lex++; /* Intentional fall-through */ } /* TODO: Emit a warning in an "else" clause? */ return accumulate(c); } /* Opening and closing braces are always syntactic elements */ else if(c == '{' || c == '}' || c == '&') { state = c; return release(TEXT); } /* Superscript and subscript: heed for various input modes */ else if(c == '^' || c == '_') { /* In all cases, prepare to emit c at next lexing round */ state = c; /* If the next character is not '{', then we don't have a {} for the argument; enable single-character mode */ if(la != '{') single = 1; /* Then emit what was already in the buffer, as text */ return release(TEXT); } /* Accumulate the current character in the buffer until it's full */ return accumulate(c); } /* lexer_command(): Execute a step of lexing from the command state This state is transitioned into when a '\' followed by a letter is encountered. The lexer remains there until the end of the command, signaled by a non-letter character, is reached. At this point, the lexer can either enter the text mode again and wait for a '{' to start arguments, or enter the text mode with the single-character argument flag so any character that comes next is treated as the argument. The original TeX does this conditionally, using single-character arguments if and only if the argument does not start with a '{'. However, in this program, because the number of arguments is not known at parse time, this would make dumb commands such as '\alpha' gobble an unbounded amount of arguments. So all commands use brace-only arguments, except for a designated set. Typically this includes \left and \right which are mainly used without braces in practice. Returns a token ID if one is emitted, -1 otherwise. */ static int lexer_command(void) { /* Feed lexer; this is safe because I heed for la = EOF */ int c = la; if(!c) { state = eof; return release(COMMAND); } la = *lex++; /* In this state, c is always in the range a .. z */ int ret = accumulate(c); /* Continue if next character is a command continuation */ if(la >= 'a' && la <= 'z') return ret; /* Otherwise, release command name */ state = text; /* Absorbing commands include "left" and "right" */ if(!acccmp("left") || !acccmp("right")) { single = 1; return release(COMMAND_ABS); } /* Special environment commands */ if(!acccmp("begin")) return release(ENV_BEGIN); if(!acccmp("end")) return release(ENV_END); return release(COMMAND); } /* yylex(): The lexer Returns the token type of the next token in the string initialized by the last call to parse_start(). */ int yylex(void) { while(1) { if(state == text) forward(lexer_text()); else if(state == command) forward(lexer_command()); else break; } /* End-of-File: give up */ if(state == eof) return 0; /* Character-specific states: feed and return state number */ int c = state; state = text; return c; } //--- // Parser interface //--- /* parse(): Parse into a TeX environment */ struct TeX_Env *parse(char const *str) { /* Initialize lexer to run on a string */ lexer_init(str); /* Push a primary environment */ env_push("primary"); int x = yyparse(); /* Pop the primary environment out of stack */ return x ? NULL: env_pop(); } //--- // Printing and errors //--- #ifndef TEX_PRINT void yyerror(TEX_UNUSED char const *error) { } #else #include #include void yyerror(char const *error) { fprintf(stderr, "Parsing failed: %s\n", error); } /* TeX_print_lex(): Print the result of lexing */ void TeX_print_lex(char const *formula) { lexer_init(formula); int token; do { token = yylex(); printf("%-3d ", token); if(token > 0x20 && token < 0x7f) printf("%c", token); if(token == TEXT) printf("TEXT %s", yylval.TEXT); if(token == COMMAND) printf("COMMAND %s", yylval.COMMAND); if(token == COMMAND_ABS) printf("COMMAND_ABS %s", yylval.COMMAND_ABS); if(token == ENV_BEGIN) printf("ENV_BEGIN"); if(token == ENV_END) printf("ENV_END"); printf("\n"); } while(token != 0); } #endif /* TEX_PRINT */