py: Don't automatically intern strings in parser.

This completes non-automatic interning of strings in the parser, so that
doc strings don't take up RAM.  It complicates the parser and compiler,
and bloats stmhal by about 300 bytes.  It's complicated because now
there are 2 kinds of parse-nodes that can be strings: interned leaves
and non-interned structs.
This commit is contained in:
Damien George 2014-05-25 22:06:06 +01:00
parent 3aaabd11a0
commit 5042bce8fb
4 changed files with 122 additions and 71 deletions

View file

@ -56,6 +56,7 @@ typedef enum {
#include "grammar.h"
#undef DEF_RULE
PN_maximum_number_of,
PN_string, // special node for non-interned string
} pn_kind_t;
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
@ -177,6 +178,8 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
}
break;
#endif
case PN_string:
return pn;
}
// fold arguments
@ -426,6 +429,9 @@ void compile_generic_all_nodes(compiler_t *comp, mp_parse_node_struct_t *pns) {
#if MICROPY_EMIT_CPYTHON
STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
return true;
}
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
return false;
}
@ -435,9 +441,7 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
return true;
}
STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) {
uint len;
const byte *str = qstr_data(qstr, &len);
STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len, bool bytes) {
bool has_single_quote = false;
bool has_double_quote = false;
for (int i = 0; i < len; i++) {
@ -476,6 +480,12 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) {
}
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1], false);
return;
}
assert(MP_PARSE_NODE_IS_LEAF(pn));
if (MP_PARSE_NODE_IS_SMALL_INT(pn)) {
vstr_printf(vstr, INT_FMT, MP_PARSE_NODE_LEAF_SMALL_INT(pn));
@ -487,8 +497,13 @@ STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vst
case MP_PARSE_NODE_ID: assert(0);
case MP_PARSE_NODE_INTEGER: vstr_printf(vstr, "%s", qstr_str(arg)); break;
case MP_PARSE_NODE_DECIMAL: vstr_printf(vstr, "%s", qstr_str(arg)); break;
case MP_PARSE_NODE_STRING: cpython_c_print_quoted_str(vstr, arg, false); break;
case MP_PARSE_NODE_BYTES: cpython_c_print_quoted_str(vstr, arg, true); break;
case MP_PARSE_NODE_STRING:
case MP_PARSE_NODE_BYTES: {
uint len;
const byte *str = qstr_data(arg, &len);
cpython_c_print_quoted_str(vstr, (const char*)str, len, MP_PARSE_NODE_LEAF_KIND(pn) == MP_PARSE_NODE_BYTES);
break;
}
case MP_PARSE_NODE_TOKEN:
switch (arg) {
case MP_TOKEN_KW_FALSE: vstr_printf(vstr, "False"); break;
@ -2058,7 +2073,8 @@ void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
} else {
// for non-REPL, evaluate then discard the expression
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0])) {
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
// do nothing with a lonely constant
} else {
compile_node(comp, pns->nodes[0]); // just an expression
@ -2498,26 +2514,40 @@ void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
int n_bytes = 0;
int string_kind = MP_PARSE_NODE_NULL;
for (int i = 0; i < n; i++) {
assert(MP_PARSE_NODE_IS_LEAF(pns->nodes[i]));
int pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]);
assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES);
int pn_kind;
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]);
assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES);
n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
} else {
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
pn_kind = MP_PARSE_NODE_STRING;
n_bytes += (machine_uint_t)pns_string->nodes[1];
}
if (i == 0) {
string_kind = pn_kind;
} else if (pn_kind != string_kind) {
compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals");
return;
}
n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
}
// concatenate string/bytes
byte *q_ptr;
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
for (int i = 0; i < n; i++) {
uint s_len;
const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len);
memcpy(s_dest, s, s_len);
s_dest += s_len;
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
uint s_len;
const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len);
memcpy(s_dest, s, s_len);
s_dest += s_len;
} else {
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
memcpy(s_dest, (const char*)pns_string->nodes[0], (machine_uint_t)pns_string->nodes[1]);
s_dest += (machine_uint_t)pns_string->nodes[1];
}
}
qstr q = qstr_build_end(q_ptr);
@ -2848,15 +2878,19 @@ void compile_node(compiler_t *comp, mp_parse_node_t pn) {
} else {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
EMIT_ARG(set_line_number, pns->source_line);
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
if (f == NULL) {
printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns));
#if MICROPY_DEBUG_PRINTERS
mp_parse_node_print(pn, 0);
#endif
compile_syntax_error(comp, pn, "internal compiler error");
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1]), false);
} else {
f(comp, pns);
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
if (f == NULL) {
printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns));
#if MICROPY_DEBUG_PRINTERS
mp_parse_node_print(pn, 0);
#endif
compile_syntax_error(comp, pn, "internal compiler error");
} else {
f(comp, pns);
}
}
}
}
@ -3033,13 +3067,13 @@ STATIC void check_for_doc_string(compiler_t *comp, mp_parse_node_t pn) {
// check the first statement for a doc string
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_expr_stmt)) {
mp_parse_node_struct_t* pns = (mp_parse_node_struct_t*)pn;
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0])) {
int kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]);
if (kind == MP_PARSE_NODE_STRING) {
compile_node(comp, pns->nodes[0]); // a doc string
// store doc string
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0])
&& MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]) == MP_PARSE_NODE_STRING)
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
// compile the doc string
compile_node(comp, pns->nodes[0]);
// store the doc string
EMIT_ARG(store_id, MP_QSTR___doc__);
}
}
}
#endif

View file

@ -66,6 +66,11 @@
#define MICROPY_ALLOC_PARSE_RESULT_INC (16)
#endif
// Strings this length or less will be interned by the parser
#ifndef MICROPY_ALLOC_PARSE_INTERN_STRING_LEN
#define MICROPY_ALLOC_PARSE_INTERN_STRING_LEN (10)
#endif
// Initial amount for ids in a scope
#ifndef MICROPY_ALLOC_SCOPE_ID_INIT
#define MICROPY_ALLOC_SCOPE_ID_INIT (4)

View file

@ -28,7 +28,7 @@
#include <stdint.h>
#include <stdio.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include "misc.h"
#include "mpconfig.h"
@ -71,7 +71,7 @@ enum {
#include "grammar.h"
#undef DEF_RULE
RULE_maximum_number_of,
RULE_string,
RULE_string, // special node for non-interned string
};
#define or(n) (RULE_ACT_OR | n)
@ -172,26 +172,26 @@ mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg) {
return (mp_parse_node_t)(kind | (arg << 5));
}
uint mp_parse_node_free(mp_parse_node_t pn) {
uint cnt = 0;
void mp_parse_node_free(mp_parse_node_t pn) {
if (MP_PARSE_NODE_IS_STRUCT(pn)) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
uint rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
if (rule_id == RULE_string) {
return;
}
bool adjust = ADD_BLANK_NODE(rule_id);
if (adjust) {
n--;
}
for (uint i = 0; i < n; i++) {
cnt += mp_parse_node_free(pns->nodes[i]);
mp_parse_node_free(pns->nodes[i]);
}
if (adjust) {
n++;
}
m_del_var(mp_parse_node_struct_t, mp_parse_node_t, n, pns);
cnt++;
}
return cnt;
}
#if MICROPY_DEBUG_PRINTERS
@ -220,19 +220,21 @@ void mp_parse_node_print(mp_parse_node_t pn, int indent) {
case MP_PARSE_NODE_TOKEN: printf("tok(" INT_FMT ")\n", arg); break;
default: assert(0);
}
} else if (MP_PARSE_NODE_IS_STRING(pn)) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
} else {
// node must be a mp_parse_node_struct_t
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
} else {
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
#ifdef USE_RULE_NAME
printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
#else
printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
#endif
for (uint i = 0; i < n; i++) {
mp_parse_node_print(pns->nodes[i], indent + 2);
for (uint i = 0; i < n; i++) {
mp_parse_node_print(pns->nodes[i], indent + 2);
}
}
}
}
@ -279,7 +281,20 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
parser->result_stack[parser->result_stack_top++] = pn;
}
STATIC void push_string(parser_t *parser, int src_line, const char *str, uint len);
// Push a parse node that holds a non-interned string literal.
//
// Builds a 2-slot mp_parse_node_struct_t of kind RULE_string and pushes it on
// the parser's result stack:
//   nodes[0] - pointer to a private copy of the string bytes
//   nodes[1] - length of the string in bytes
//
// parser:   parser whose result stack receives the node
// src_line: source line recorded in the node
// str, len: bytes to copy (copied with memcpy; no terminator is appended)
//
// If the node struct cannot be allocated, memory_error(parser) is called and
// nothing is pushed.
STATIC void push_result_string(parser_t *parser, int src_line, const char *str, uint len) {
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
if (pn == NULL) {
memory_error(parser);
return;
}
pn->source_line = src_line;
// kind goes in the low 8 bits, number of nodes (2) in the bits above
pn->kind_num_nodes = RULE_string | (2 << 8);
// NOTE(review): unlike the struct allocation above, the result of m_new is
// not checked here - confirm how m_new reports allocation failure.
char *p = m_new(char, len);
memcpy(p, str, len);
// the node slots carry raw data (pointer and length), not child parse nodes
// NOTE(review): mp_parse_node_free in this change returns early for
// RULE_string nodes - confirm where this buffer and the struct are released.
pn->nodes[0] = (machine_int_t)p;
pn->nodes[1] = len;
push_result_node(parser, (mp_parse_node_t)pn);
}
STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
const mp_token_t *tok = mp_lexer_cur(lex);
@ -326,10 +341,24 @@ STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
}
} else if (tok->kind == MP_TOKEN_STRING) {
printf("Pushing string\n");
push_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len);
return;
// pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qstr_from_strn(tok->str, tok->len));
// Don't automatically intern all strings. doc strings (which are usually large)
// will be discarded by the compiler, and so we shouldn't intern them.
qstr qst = MP_QSTR_NULL;
if (tok->len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
// intern short strings
qst = qstr_from_strn(tok->str, tok->len);
} else {
// check if this string is already interned
qst = qstr_find_strn((const byte*)tok->str, tok->len);
}
if (qst != MP_QSTR_NULL) {
// qstr exists, make a leaf node
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
} else {
// not interned, make a node holding a pointer to the string data
push_result_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len);
return;
}
} else if (tok->kind == MP_TOKEN_BYTES) {
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(tok->str, tok->len));
} else {
@ -338,21 +367,6 @@ printf("Pushing string\n");
push_result_node(parser, pn);
}
// Push a parse node that holds a non-interned string literal.
//
// Builds a 2-slot mp_parse_node_struct_t of kind RULE_string whose nodes[0]
// is a pointer to a private copy of the string bytes and whose nodes[1] is
// the byte length, then pushes it on the parser's result stack.
//
// If the node struct cannot be allocated, memory_error(parser) is called and
// nothing is pushed.
STATIC void push_string(parser_t *parser, int src_line, const char *str, uint len) {
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
if (pn == NULL) {
memory_error(parser);
return;
}
pn->source_line = src_line;
// kind goes in the low 8 bits, number of nodes (2) in the bits above
pn->kind_num_nodes = RULE_string | (2 << 8);
// NOTE(review): the result of m_new is not checked here - confirm how
// m_new reports allocation failure.
char *p = m_new(char, len);
memcpy(p, str, len);
// the node slots carry raw data (pointer and length), not child parse nodes
pn->nodes[0] = (machine_int_t)p;
pn->nodes[1] = len;
push_result_node(parser, (mp_parse_node_t)pn);
}
STATIC void push_result_rule(parser_t *parser, int src_line, const rule_t *rule, int num_args) {
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, num_args);
if (pn == NULL) {
@ -541,14 +555,13 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
}
}
#if 1 && !MICROPY_ENABLE_DOC_STRING
// this code discards lonely statement, such as doc strings
// problem is that doc strings have already been interned, so this doesn't really help reduce RAM usage
#if !MICROPY_EMIT_CPYTHON && !MICROPY_ENABLE_DOC_STRING
// this code discards lonely statements, such as doc strings
if (input_kind != MP_PARSE_SINGLE_INPUT && rule->rule_id == RULE_expr_stmt && peek_result(&parser, 0) == MP_PARSE_NODE_NULL) {
mp_parse_node_t p = peek_result(&parser, 1);
if ((MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) || MP_PARSE_NODE_IS_STRING(p)) {
pop_result(parser);
pop_result(parser);
if ((MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) || MP_PARSE_NODE_IS_STRUCT_KIND(p, RULE_string)) {
pop_result(&parser);
pop_result(&parser);
push_result_rule(&parser, rule_src_line, rules[RULE_pass_stmt], 0);
break;
}

View file

@ -80,10 +80,9 @@ typedef struct _mp_parse_node_struct_t {
#define MP_PARSE_NODE_LEAF_SMALL_INT(pn) (((machine_int_t)(pn)) >> 1)
#define MP_PARSE_NODE_STRUCT_KIND(pns) ((pns)->kind_num_nodes & 0xff)
#define MP_PARSE_NODE_STRUCT_NUM_NODES(pns) ((pns)->kind_num_nodes >> 8)
#define MP_PARSE_NODE_IS_STRING(pn) (MP_PARSE_NODE_STRUCT_KIND((mp_parse_node_struct_t*)pn) == RULE_string)
mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg);
uint mp_parse_node_free(mp_parse_node_t pn);
void mp_parse_node_free(mp_parse_node_t pn);
void mp_parse_node_print(mp_parse_node_t pn, int indent);