词法分析器+行语法分析器
主程序 (C/C++)
#include "source.cpp"
随机测试文本产生程序 (C)
#include <stdio.h>
int main() {
printf("\
let x = 10\n\
\n\
def fun1()\n\
let x = 0.5\n\
for i in 10\n\
fun2(2,i,7)\n\
x = x + i\n\
while x > 10\n\
x = x // 2\n\
return x\n\
\n\
# comment\n\
def fun2(x, y, z)\n\
if (x < y) and (not (z < y))\n\
print(x,y,z,'in order')\n\
else\n\
print(x,y,z,\"out of order\")\n\
\n\
print(fun1())\n");
}
答案 (C/C++与主程序相同)
// util
#include <string>
#include <vector>
#include <iostream>
#include <sstream>
using namespace std;
#define I(i,n) for (int i = 0; i < (n); ++ i)
template <typename E>
struct _Node
{
E elem;
_Node * next;
_Node * prev;
};
class NotSuchElementException {};
template <typename E>
class LinkedList
{
int _size;
_Node<E> * head;
_Node<E> * tail;
public:
LinkedList() : _size(0) {}
int size() const { return _size; }
E & getFirst() const {
if (_size == 0) throw NotSuchElementException();
return head->elem;
}
E & getLast() const {
if (_size == 0) throw NotSuchElementException();
return tail->elem;
}
E removeFirst() {
if (_size == 0) throw NotSuchElementException();
_Node<E> * tmp = head;
head = head->next;
-- _size;
E e = tmp->elem;
delete tmp;
return e;
}
E removeLast() {
if (_size == 0) throw NotSuchElementException();
_Node<E> * tmp = tail;
tail = tail->prev;
-- _size;
E e = tmp->elem;
delete tmp;
return e;
}
void addFirst(const E & elem) {
_Node<E> * tmp = new _Node<E>;
tmp->elem = elem;
tmp->next = head;
if (_size > 0) head->prev = tmp;
head = tmp;
if (_size == 0) tail = tmp;
++ _size;
}
void addLast(const E & elem) {
_Node<E> * tmp = new _Node<E>;
tmp->elem = elem;
tmp->prev = tail;
if (_size > 0) tail->next = tmp;
tail = tmp;
if (_size == 0) head = tmp;
++ _size;
}
void clear() {
while (_size > 0) removeFirst();
}
~LinkedList() {
clear();
}
operator vector<E>() const {
vector<E> vec;
_Node<E> * tmp = head;
I(i, _size) {
vec.push_back(tmp->elem);
tmp = tmp->next;
}
return vec;
}
void copy_list(const vector<E> & vec) {
clear();
I(i, vec.size()) {
addLast(vec[i]);
}
}
LinkedList(const vector<E> & vec) : _size(0) {
copy_list(vec);
}
LinkedList(const LinkedList & list) : _size(0) {
copy_list(vector<E>(list));
}
LinkedList & operator = (const LinkedList & list) {
copy_list(vector<E>(list));
return *this;
}
};
class NoSuchKeyException {};
template <typename I>
I circular_shift(I bits, int shift_bits) {
I num_bits = 8 * sizeof(I);
shift_bits = shift_bits % num_bits;
return bits << shift_bits | bits >> (num_bits - shift_bits);
}
int operator%(const string & key, int mod) {
size_t code = 0;
I(i, key.size())
code ^= circular_shift(key[i], i * 5);
return code % mod;
}
// 如果需要把类型 K 作为键,需要有:
// 1) int operater%(K, int)
// 2) bool K::operater==(K)
template <typename K, typename T>
class _Hash
{
vector<T> tuples;
size_t _size;
size_t canonical_index(const K & key) const {
return key % tuples.size();
}
// the returned tuple is either empty or with the same key
size_t indexOfKey(const K & key) const {
size_t index = canonical_index(key);
while (true) { // will enter a dead loop if there is not empty tuple
if (! tuples[index].in_use) return index;
if (tuples[index].key == key) return index;
index = (index + 1) % tuples.size();
}
}
public:
_Hash() : _size(0) {
tuples.resize(2);
}
bool hasKey(const K & key) const {
size_t index = indexOfKey(key);
return tuples[index].in_use;
}
T & get(const K & key) {
size_t index = indexOfKey(key);
if (! tuples[index].in_use) throw NoSuchKeyException();
return tuples[index];
}
const T & get(const K & key) const {
size_t index = indexOfKey(key);
if (! tuples[index].in_use) throw NoSuchKeyException();
return tuples[index];
}
void put(const T & tuple) {
size_t index = indexOfKey(tuple.key);
bool in_use = tuples[index].in_use;
tuples[index] = tuple;
tuples[index].in_use = true;
if (in_use) return;
++ _size;
// make sure empty > size / 2
if (2 * _size > tuples.size())
_double_tuples();
}
void _double_tuples() {
vector<T> non_empty_tuples;
I(i, tuples.size())
if (tuples[i].in_use)
non_empty_tuples.push_back(tuples[i]);
tuples.resize(tuples.size() * 2);
clear();
I(i, non_empty_tuples.size())
put(non_empty_tuples[i]);
}
static bool _between(size_t hole, size_t canonical, size_t tuple_index) {
if (hole < tuple_index)
return canonical > hole && canonical <= tuple_index;
else
return canonical > hole || canonical <= tuple_index;
}
void remove(const K & key) {
size_t index = indexOfKey(key);
if (! tuples[index].in_use)
throw NoSuchKeyException();
tuples[index].in_use = false;
-- _size;
// if there is a hole between a tuple and its canonical position
// move the tuple to the hole
size_t hole = index;
size_t tuple_index = hole;
while (true) {
tuple_index = (tuple_index + 1) % tuples.size();
if (! tuples[tuple_index].in_use) return;
size_t canonical = canonical_index(tuples[tuple_index].key);
if (_between(hole, canonical, tuple_index)) continue;
tuples[hole] = tuples[tuple_index];
hole = tuple_index;
tuples[hole].in_use = false;
}
}
size_t size() const { return _size; }
void clear() {
I(i, tuples.size())
tuples[i].in_use = false;
_size = 0;
}
vector<K> keys() const {
vector<K> ks;
I(i, tuples.size())
if (tuples[i].in_use)
ks.push_back(tuples[i].key);
return ks;
}
};
template <typename K, typename V>
class HashTable
{
class _Tuple
{
public:
K key;
V val;
bool in_use;
_Tuple() : in_use(false) {}
_Tuple(const K & k, const V & v) : in_use(false), key(k), val(v) {}
};
_Hash<K,_Tuple> _hash;
public:
bool hasKey(const K & key) const {
return _hash.hasKey(key);
}
size_t size() const { return _hash.size(); }
void clear() {
_hash.clear();
}
const V & get(const K & k) const {
return _hash.get(k).val;
}
V & get(const K & k) {
return _hash.get(k).val;
}
void put(const K & k, const V & v) {
_hash.put(_Tuple(k, v));
}
void remove(const K & k) {
_hash.remove(k);
}
vector<K> keys() const {
return _hash.keys();
}
};
template <typename E>
class HashSet
{
class _Tuple
{
public:
E key;
bool in_use;
_Tuple() : in_use(false) {}
_Tuple(const E & e) : in_use(false), key(e) {}
};
_Hash<E,_Tuple> _hash;
public:
bool has(const E & e) const {
return _hash.hasKey(e);
}
size_t size() const { return _hash.size(); }
void clear() {
_hash.clear();
}
void add(const E & e) {
_hash.put(_Tuple(e));
}
void remove(const E & e) {
_hash.remove(e);
}
operator vector<E>() const {
return _hash.keys();
}
};
// parse
class CompilationError : public runtime_error
{
public:
CompilationError(const char err_msg[]) : runtime_error(err_msg) {}
};
string _normalize_braket(string text) {
string text2 = text;
I(i, text2.size())
switch (text[i]) {
case '(': text2[i] = '['; break;
case ')': text2[i] = ']'; break;
}
return text2;
}
class Span
{
public:
int line_no;
int prev_end;
int char_start;
int char_end;
string type;
string text;
string key;
vector<Span *> children;
Span() {}
Span(int line_no, int prev_end, int char_start, int char_end,
const string & type, const string & text) {
this->line_no = line_no;
this->prev_end = prev_end;
this->char_start = char_start;
this->char_end = char_end;
this->type = type;
this->text = text;
stringstream ss;
ss<<type<<"("<<char_start<<","<<char_end<<")";
this->key = ss.str();
}
string linearize() const {
stringstream ss;
ss<<"("<<_normalize_braket(type);
if (children.size() == 0)
ss<<" "<<_normalize_braket(text);
else {
I(i, children.size())
ss<<" "<<children[i]->linearize();
}
ss<<")";
return ss.str();
}
};
typedef int (* matcher)(const string & line, int start);
class _Matcher
{
public:
string type;
virtual int match(const string & line, int start) const = 0;
};
class TextMatcher : public _Matcher
{
string text;
public:
TextMatcher(const string & type, const string & text) {
this->type = type;
this->text = text;
}
int match(const string & line, int start) const {
string word = line.substr(start, this->text.size());
return (word == this->text) ? this->text.size() : 0;
}
};
class SymbolMatcher : public _Matcher
{
matcher func;
public:
SymbolMatcher(const string & type, matcher func) {
this->type = type;
this->func = func;
}
int match(const string & line, int start) const {
return this->func(line, start);
}
};
class LineLex
{
public:
vector<_Matcher *> matchers;
void match(const string & line, int line_no, vector<Span> & spans) {
int prev_end = 0;
for (int i = 0; i < line.size(); ++ i) {
if (line[i] == ' ' || line[i] == '\t') continue;
int max_match_size = -1;
string match_type;
for (int j = 0; j < this->matchers.size(); ++ j) {
int match_size = this->matchers[j]->match(line, i);
if (match_size > 0 && match_size > max_match_size) {
max_match_size = match_size;
match_type = this->matchers[j]->type;
}
}
if (max_match_size == -1) {
stringstream ss;
ss<<"Lexical error: "<<line.substr(i)<<" line# "<<(line_no+1)<<" char#"<<(i+1);
throw CompilationError(ss.str().c_str());
}
spans.push_back(Span(line_no, prev_end, i, i+max_match_size,
match_type, line.substr(i, max_match_size)));
prev_end = i+max_match_size;
i += max_match_size - 1;
}
}
};
typedef void (* line_action_func)(Span & span);
class Rule
{
public:
line_action_func action;
string left;
vector<string> right;
string key;
Rule() {}
Rule(line_action_func action, const string & left, const string & right1,
const string & right2="", const string & right3="",
const string & right4="", const string & right5="") {
this->action = action;
this->left = left;
this->right.push_back(right1);
if (right2 != "") this->right.push_back(right2);
if (right3 != "") this->right.push_back(right3);
if (right4 != "") this->right.push_back(right4);
if (right5 != "") this->right.push_back(right5);
stringstream ss;
ss <<this->left<<" ->";
I(i, right.size())
ss<<" "<< right[i];
this->key = ss.str();
}
};
class LineTree
{
HashSet<string> span_set;
vector<Span *> all_spans;
HashTable<string, vector<Rule *> > rule_end_at;
HashTable<int, vector<Span *> > spans_end_at;
LinkedList<Span *> to_reduce;
bool _inited;
public:
HashTable<string, Rule> rules;
LineTree() {
_reset();
_inited = false;
}
~LineTree() {
_reset();
}
void add_rule(const Rule & rule) {
if (_inited) throw CompilationError("LineTree is inited and fixed");
rules.put(rule.key, rule);
}
private:
void _init() {
if (_inited) return;
_inited = true;
vector<string> rule_keys = rules.keys();
I(i, rule_keys.size()) {
string key = rule_keys[i];
Rule & rule = rules.get(key);
string type = rule.right[rule.right.size() - 1];
if (! rule_end_at.hasKey(type))
rule_end_at.put(type, vector<Rule *>());
vector<Rule *> & rule_end_at1 = rule_end_at.get(type);
rule_end_at1.push_back(&rule);
}
}
void _reset() {
I(i, all_spans.size())
delete all_spans[i];
all_spans.clear();
span_set.clear();
spans_end_at.clear();
to_reduce.clear();
}
void _try_match_rule(const Rule & rule, Span & span, LinkedList<Span *> matched_list) {
matched_list.addFirst(&span);
if (matched_list.size() == rule.right.size()) {
Span & end = *matched_list.getLast();
Span * matched = new Span(span.line_no,
span.prev_end,span.char_start,end.char_end,rule.left,rule.key);
matched->children = matched_list;
all_spans.push_back(matched);
if (span_set.has(matched->key))
_report_ambiguity_error(*matched);
span_set.add(matched->key);
to_reduce.addLast(matched);
spans_end_at.get(matched->char_end).push_back(matched);
}
else {
string prev_type = rule.right[rule.right.size()-1-matched_list.size()];
if (spans_end_at.hasKey(span.prev_end)) {
vector<Span *> & span_end_at1 = spans_end_at.get(span.prev_end);
I(i, span_end_at1.size())
if (span_end_at1[i]->type == prev_type)
_try_match_rule(rule, *span_end_at1[i], matched_list);
}
}
}
void _add_span(Span & span) {
to_reduce.addLast(&span);
vector<Span *> span_end_at;
span_end_at.push_back(&span);
spans_end_at.put(span.char_end, span_end_at);
while (to_reduce.size() > 0) {
Span * to_reduce1 = to_reduce.removeFirst();
if (! rule_end_at.hasKey(to_reduce1->type)) continue;
vector<Rule *> & rules = rule_end_at.get(to_reduce1->type);
LinkedList<Span *> matched_list;
I(i, rules.size())
_try_match_rule(*rules[i], *to_reduce1, matched_list);
}
}
Span _get_result(int end, vector<Span> & spans) {
vector<Span *> & span_end_at1 = spans_end_at.get(end);
Span * res = 0;
I(i, span_end_at1.size())
if (span_end_at1[i]->prev_end == 0 && span_end_at1[i]->type[0] == '#') {
if (res == 0) res = span_end_at1[i];
else _report_ambiguity_error(*res);
}
if (res == 0)
_report_syntax_error(spans);
return *res;
}
void _report_ambiguity_error(Span & matched) {
stringstream ss;
ss<<"Ambiguous span "<<matched.key<< " line:"<<(matched.line_no+1);
throw CompilationError(ss.str().c_str());
}
void _report_syntax_error(vector<Span> & spans) {
stringstream ss;
ss<<"Syntax error at line:"<<(spans[0].line_no+1);
throw CompilationError(ss.str().c_str());
}
public:
Span reduce(vector<Span> & spans) {
_init();
_reset();
I(i, spans.size()) {
_add_span(spans[i]);
}
return _get_result(spans[spans.size()-1].char_end, spans);
}
};
// line
//+ $int
int match_lex_int(const string & line, int start) {
for (int i = start; i < line.size(); ++ i) {
if (line[i] >= '0' && line[i] <= '9') continue;
return i - start;
}
return line.size() - start;
}
//+ $float
int match_lex_float(const string & line, int start) {
int num_dot = 0;
for (int i = start; i < line.size(); ++ i) {
if (line[i] == '.') {
num_dot += 1;
continue;
}
if (line[i] >= '0' && line[i] <= '9') continue;
return i - start;
}
if (num_dot != 1) return 0;
return line.size() - start;
}
const char * op[] = {"and", "not", "or", "xor", "+", "-", "*", "/", "//",
"~", "%", "^", "&", "|", ">", "<", ">=", "<=", "==", "!="};
bool op_has(const string & text) {
I(i, sizeof(op)/sizeof(char *))
if (text == op[i]) return true;
return false;
}
//+ $op
int match_lex_op(const string & line, int start) {
if (op_has(line.substr(start, 3))) return 3;
if (op_has(line.substr(start, 2))) return 2;
if (op_has(line.substr(start, 1))) return 1;
return 0;
}
//+ $id
int match_lex_id(const string & line, int start) {
for (int i = start; i < line.size(); ++ i) {
if (i != start && line[i] >= '0' && line[i] <= '9') continue;
if (line[i] == '_') continue;
if (line[i] >= 'a' && line[i] <= 'z') continue;
if (line[i] >= 'A' && line[i] <= 'Z') continue;
return i - start;
}
return line.size() - start;
}
//+ $text
int match_lex_text(const string & line, int start) {
char delimitor = ' ';
for (int i = start; i < line.size(); ++ i) {
if (i == start) {
if (line[i] == '"' || line[i] == '\'') {
delimitor = line[i];
continue;
}
}
else {
if (line[i] == delimitor) {
return i + 1 - start;
}
if (line[i] == '\\') {
if (i + 1 == line.size()) return 0;
if (line[i + 1] == '\\' || line[i + 1] == delimitor ||
line[i + 1] == 'n' || line[i + 1] == 'r' || line[i + 1] == 't') {
++ i;
continue;
}
else return 0;
}
continue;
}
return 0;
}
return 0;
}
//+ $comment
int match_lex_comment(const string & line, int start) {
if (line[start] != '#') return 0;
return line.size() - start;
}
LineLex line_lex;
LineTree line_tree;
TextMatcher t_m_0("'let'", "let");
TextMatcher t_m_1("'='", "=");
TextMatcher t_m_2("'def'", "def");
TextMatcher t_m_3("'('", "(");
TextMatcher t_m_4("')'", ")");
TextMatcher t_m_5("'if'", "if");
TextMatcher t_m_6("'else'", "else");
TextMatcher t_m_7("'for'", "for");
TextMatcher t_m_8("'in'", "in");
TextMatcher t_m_9("'while'", "while");
TextMatcher t_m_10("'return'", "return");
TextMatcher t_m_11("','", ",");
SymbolMatcher s_m_0("$int", match_lex_int);
SymbolMatcher s_m_1("$float", match_lex_float);
SymbolMatcher s_m_2("$op", match_lex_op);
SymbolMatcher s_m_3("$id", match_lex_id);
SymbolMatcher s_m_4("$text", match_lex_text);
SymbolMatcher s_m_5("$comment", match_lex_comment);
void init_rules() {
line_lex.matchers.push_back(&t_m_0);
line_lex.matchers.push_back(&t_m_1);
line_lex.matchers.push_back(&t_m_2);
line_lex.matchers.push_back(&t_m_3);
line_lex.matchers.push_back(&t_m_4);
line_lex.matchers.push_back(&t_m_5);
line_lex.matchers.push_back(&t_m_6);
line_lex.matchers.push_back(&t_m_7);
line_lex.matchers.push_back(&t_m_8);
line_lex.matchers.push_back(&t_m_9);
line_lex.matchers.push_back(&t_m_10);
line_lex.matchers.push_back(&t_m_11);
line_lex.matchers.push_back(&s_m_0);
line_lex.matchers.push_back(&s_m_1);
line_lex.matchers.push_back(&s_m_2);
line_lex.matchers.push_back(&s_m_3);
line_lex.matchers.push_back(&s_m_4);
line_lex.matchers.push_back(&s_m_5);
line_tree.add_rule(Rule(NULL, "#let", "'let'", "$id", "'='", "exp"));
line_tree.add_rule(Rule(NULL, "#def", "'def'", "$id", "'('", "')'"));
line_tree.add_rule(Rule(NULL, "#def", "'def'", "$id", "'('", "id_list", "')'"));
line_tree.add_rule(Rule(NULL, "#if", "'if'", "exp"));
line_tree.add_rule(Rule(NULL, "#call", "fun_call"));
line_tree.add_rule(Rule(NULL, "#else", "'else'"));
line_tree.add_rule(Rule(NULL, "#for", "'for'", "$id", "'in'", "exp"));
line_tree.add_rule(Rule(NULL, "#while", "'while'", "exp"));
line_tree.add_rule(Rule(NULL, "#assign", "$id", "'='", "exp"));
line_tree.add_rule(Rule(NULL, "#return", "'return'", "exp"));
line_tree.add_rule(Rule(NULL, "#comment", "$comment"));
line_tree.add_rule(Rule(NULL, "s_exp", "$int"));
line_tree.add_rule(Rule(NULL, "s_exp", "$float"));
line_tree.add_rule(Rule(NULL, "s_exp", "$id"));
line_tree.add_rule(Rule(NULL, "exp", "s_exp", "$op", "s_exp"));
line_tree.add_rule(Rule(NULL, "exp", "$op", "s_exp"));
line_tree.add_rule(Rule(NULL, "s_exp", "'('", "exp", "')'"));
line_tree.add_rule(Rule(NULL, "s_exp", "fun_call"));
line_tree.add_rule(Rule(NULL, "s_exp", "$text"));
line_tree.add_rule(Rule(NULL, "exp", "s_exp"));
line_tree.add_rule(Rule(NULL, "id_list", "$id"));
line_tree.add_rule(Rule(NULL, "id_list", "id_list", "','", "$id"));
line_tree.add_rule(Rule(NULL, "exp_list", "exp"));
line_tree.add_rule(Rule(NULL, "exp_list", "exp_list", "','", "exp"));
line_tree.add_rule(Rule(NULL, "fun_call", "$id", "'('", "')'"));
line_tree.add_rule(Rule(NULL, "fun_call", "$id", "'('", "exp_list", "')'"));
}
void parse(const char filename[]) {
/*
ifstream infile(filename);
if (infile.fail()) {
cout << "Cannot open file " << filename << endl;
return;
}
*/
string line;
vector<vector<Span> > spans;
int i = 0;
//while (getline(infile, line)) {
while (getline(cin, line)) {
spans.push_back(vector<Span>());
line_lex.match(line, i, spans[i]);
++ i;
}
//infile.close();
vector<Span> line_spans;
I(i, spans.size()) {
if (spans[i].size() > 0) {
Span line = line_tree.reduce(spans[i]);
line_spans.push_back(line);
cout << line.linearize() << endl;
}
}
}
int main(int argc, char * argv[]) {
init_rules();
/*
if (argc != 2) {
cout << "No input file" << endl;
return 1;
}
parse(argv[1]);
*/
parse(NULL);
}