compiler_2 - 程序设计

词法分析器+行语法分析器
主程序 (C/C++)

#include "source.cpp"

随机测试文本产生程序 (C)

#include <stdio.h>

int main() {
    printf("\
let x = 10\n\
\n\
def fun1()\n\
    let x = 0.5\n\
    for i in 10\n\
        fun2(2,i,7)\n\
        x = x + i\n\
    while x > 10\n\
        x = x // 2\n\
    return x\n\
\n\
# comment\n\
def fun2(x, y, z)\n\
    if (x < y) and (not (z < y))\n\
        print(x,y,z,'in order')\n\
    else\n\
        print(x,y,z,\"out of order\")\n\
\n\
print(fun1())\n");
}

答案 (C/C++与主程序相同)

// util

#include <string>
#include <vector>
#include <iostream>
#include <sstream>
using namespace std;

#define I(i,n) for (int i = 0; i < (n); ++ i)

template <typename E>
struct _Node
{
    E elem;
    _Node * next;
    _Node * prev;
};

class NotSuchElementException {};

template <typename E>
class LinkedList
{
    int _size;
    _Node<E> * head;
    _Node<E> * tail;

public:
    LinkedList() : _size(0) {}

    int size() const { return _size; }

    E & getFirst() const {
        if (_size == 0) throw NotSuchElementException();
        return head->elem;
    }

    E & getLast() const {
        if (_size == 0) throw NotSuchElementException();
        return tail->elem;
    }

    E removeFirst() {
        if (_size == 0) throw NotSuchElementException();
        _Node<E> * tmp = head;
        head = head->next;
        -- _size;
        E e = tmp->elem;
        delete tmp;
        return e;
    }

    E removeLast() {
        if (_size == 0) throw NotSuchElementException();
        _Node<E> * tmp = tail;
        tail = tail->prev;
        -- _size;
        E e = tmp->elem;
        delete tmp;
        return e;
    }

    void addFirst(const E & elem) {
        _Node<E> * tmp = new _Node<E>;
        tmp->elem = elem;
        tmp->next = head;
        if (_size > 0) head->prev = tmp;
        head = tmp;
        if (_size == 0) tail = tmp;
        ++ _size;
    }

    void addLast(const E & elem) {
        _Node<E> * tmp = new _Node<E>;
        tmp->elem = elem;
        tmp->prev = tail;
        if (_size > 0) tail->next = tmp;
        tail = tmp;
        if (_size == 0) head = tmp;
        ++ _size;
    }

    void clear() {
        while (_size > 0) removeFirst();
    }

    ~LinkedList() {
        clear();
    }

    operator vector<E>() const {
        vector<E> vec;
        _Node<E> * tmp = head;
        I(i, _size) {
            vec.push_back(tmp->elem);
            tmp = tmp->next;
        }
        return vec;
    }

    void copy_list(const vector<E> & vec) {
        clear();
        I(i, vec.size()) {
            addLast(vec[i]);
        }
    }

    LinkedList(const vector<E> & vec) : _size(0) {
        copy_list(vec);
    }

    LinkedList(const LinkedList & list) : _size(0) {
        copy_list(vector<E>(list));
    }

    LinkedList & operator = (const LinkedList & list) {
        copy_list(vector<E>(list));
        return *this;
    }

};

class NoSuchKeyException {};

template <typename I>
I circular_shift(I bits, int shift_bits) {
    I num_bits = 8 * sizeof(I);
    shift_bits = shift_bits % num_bits;
    return bits << shift_bits | bits >> (num_bits - shift_bits);
}

int operator%(const string & key, int mod) {
    size_t code = 0;
    I(i, key.size())
        code ^= circular_shift(key[i], i * 5);
    return code % mod;
}

// 如果需要把类型 K 作为键，需要有：
//    1) int operater%(K, int)
//    2) bool K::operater==(K)

template <typename K, typename T>
class _Hash
{
    vector<T> tuples;
    size_t _size;

    size_t canonical_index(const K & key) const {
        return key % tuples.size();
    }

    // the returned tuple is either empty or with the same key
    size_t indexOfKey(const K & key) const {
        size_t index = canonical_index(key);
        while (true) { // will enter a dead loop if there is not empty tuple
            if (! tuples[index].in_use) return index;
            if (tuples[index].key == key) return index;
            index = (index + 1) % tuples.size();
        }
    }

public:
    _Hash() : _size(0) {
        tuples.resize(2);
    }

    bool hasKey(const K & key) const {
        size_t index = indexOfKey(key);
        return tuples[index].in_use;
    }

    T & get(const K & key) {
        size_t index = indexOfKey(key);
        if (! tuples[index].in_use) throw NoSuchKeyException();
        return tuples[index];
    }

    const T & get(const K & key) const {
        size_t index = indexOfKey(key);
        if (! tuples[index].in_use) throw NoSuchKeyException();
        return tuples[index];
    }

    void put(const T & tuple) {
        size_t index = indexOfKey(tuple.key);
        bool in_use = tuples[index].in_use;
        tuples[index] = tuple;
        tuples[index].in_use = true;
        if (in_use) return;
        ++ _size;
        // make sure empty > size / 2
        if (2 * _size > tuples.size())
            _double_tuples();
    }

    void _double_tuples() {
        vector<T> non_empty_tuples;
        I(i, tuples.size())
            if (tuples[i].in_use)
                non_empty_tuples.push_back(tuples[i]);
        tuples.resize(tuples.size() * 2);
        clear();
        I(i, non_empty_tuples.size())
            put(non_empty_tuples[i]);
    }

    static bool _between(size_t hole, size_t canonical, size_t tuple_index) {
        if (hole < tuple_index)
            return canonical > hole && canonical <= tuple_index;
        else
            return canonical > hole || canonical <= tuple_index;
    }

    void remove(const K & key) { 
        size_t index = indexOfKey(key);
        if (! tuples[index].in_use) 
            throw NoSuchKeyException();
        tuples[index].in_use = false;
        -- _size;
        // if there is a hole between a tuple and its canonical position
        // move the tuple to the hole
        size_t hole = index;
        size_t tuple_index = hole;
        while (true) {
            tuple_index = (tuple_index + 1) % tuples.size();
            if (! tuples[tuple_index].in_use) return;
            size_t canonical = canonical_index(tuples[tuple_index].key);
            if (_between(hole, canonical, tuple_index)) continue;
            tuples[hole] = tuples[tuple_index];
            hole = tuple_index;
            tuples[hole].in_use = false;
        }
    }

    size_t size() const { return _size; }

    void clear() {
        I(i, tuples.size())
            tuples[i].in_use = false;
        _size = 0;
    }

    vector<K> keys() const {
        vector<K> ks;
        I(i, tuples.size())
            if (tuples[i].in_use)
                ks.push_back(tuples[i].key);
        return ks;
    }

};

template <typename K, typename V>
class HashTable
{
    class _Tuple
    {
    public:
        K key;
        V val;
        bool in_use;

        _Tuple() : in_use(false) {}
        _Tuple(const K & k, const V & v) : in_use(false), key(k), val(v) {}
    };

    _Hash<K,_Tuple> _hash;

public:
    bool hasKey(const K & key) const {
        return _hash.hasKey(key);
    }

    size_t size() const { return _hash.size(); }

    void clear() {
        _hash.clear();
    }

    const V & get(const K & k) const {
        return _hash.get(k).val;
    }

    V & get(const K & k) {
        return _hash.get(k).val;
    }

    void put(const K & k, const V & v) {
        _hash.put(_Tuple(k, v));
    }

    void remove(const K & k) {
        _hash.remove(k);
    }

    vector<K> keys() const {
        return _hash.keys();
    }

};


template <typename E>
class HashSet
{
    class _Tuple
    {
    public:
        E key;
        bool in_use;

        _Tuple() : in_use(false) {}
        _Tuple(const E & e) : in_use(false), key(e) {}
    };

    _Hash<E,_Tuple> _hash;

public:
    bool has(const E & e) const {
        return _hash.hasKey(e);
    }

    size_t size() const { return _hash.size(); }

    void clear() {
        _hash.clear();
    }

    void add(const E & e) {
        _hash.put(_Tuple(e));
    }

    void remove(const E & e) {
        _hash.remove(e);
    }

    operator vector<E>() const {
        return _hash.keys();
    }

};

// parse

class CompilationError : public runtime_error
{
public:
    CompilationError(const char err_msg[]) : runtime_error(err_msg) {}
};

string _normalize_braket(string text) {
    string text2 = text;
    I(i, text2.size())
        switch (text[i]) {
        case '(': text2[i] = '['; break;
        case ')': text2[i] = ']'; break;
        }
    return text2;
}

class Span
{
public:
    int line_no;
    int prev_end;
    int char_start;
    int char_end;
    string type;
    string text;
    string key;
    vector<Span *> children;

    Span() {}

    Span(int line_no, int prev_end, int char_start, int char_end, 
            const string & type, const string & text) {
        this->line_no = line_no;
        this->prev_end = prev_end;
        this->char_start = char_start;
        this->char_end = char_end;
        this->type = type;
        this->text = text;
        stringstream ss;
        ss<<type<<"("<<char_start<<","<<char_end<<")";
        this->key = ss.str();
    }

    string linearize() const {
        stringstream ss;
        ss<<"("<<_normalize_braket(type);
        if (children.size() == 0)
            ss<<" "<<_normalize_braket(text);
        else {
            I(i, children.size())
                ss<<" "<<children[i]->linearize();
        }
        ss<<")";
        return ss.str();
    }
};

typedef int (* matcher)(const string & line, int start);

class _Matcher
{
public:
    string type;
    virtual int match(const string & line, int start) const = 0;
};

class TextMatcher : public _Matcher
{
    string text;
public:
    TextMatcher(const string & type, const string & text) {
        this->type = type;
        this->text = text;
    }

    int match(const string & line, int start) const {
        string word = line.substr(start, this->text.size());
        return (word == this->text) ? this->text.size() : 0;
    }
};

class SymbolMatcher : public _Matcher
{
    matcher func;
public:
    SymbolMatcher(const string & type, matcher func) {
        this->type = type;
        this->func = func;    
    }

    int match(const string & line, int start) const {
        return this->func(line, start);
    }
};

class LineLex
{
public:
    vector<_Matcher *> matchers;

    void match(const string & line, int line_no, vector<Span> & spans) {
        int prev_end = 0;
        for (int i = 0; i < line.size(); ++ i) {
            if (line[i] == ' ' || line[i] == '\t') continue;
            int max_match_size = -1;
            string match_type;
            for (int j = 0; j < this->matchers.size(); ++ j) {
                int match_size = this->matchers[j]->match(line, i);
                if (match_size > 0 && match_size > max_match_size) {
                    max_match_size = match_size;
                    match_type = this->matchers[j]->type;
                }
            }
            if (max_match_size == -1) {
                stringstream ss;
                ss<<"Lexical error: "<<line.substr(i)<<" line# "<<(line_no+1)<<" char#"<<(i+1);
                throw CompilationError(ss.str().c_str());
            }
            spans.push_back(Span(line_no, prev_end, i, i+max_match_size,
                                match_type, line.substr(i, max_match_size)));
            prev_end = i+max_match_size;
            i += max_match_size - 1;
        }
    }
};


typedef void (* line_action_func)(Span & span);

class Rule
{
public:
    line_action_func action;
    string left;
    vector<string> right;
    string key;

    Rule() {}

    Rule(line_action_func action, const string & left, const string & right1,
        const string & right2="", const string & right3="", 
        const string & right4="", const string & right5="") {
        this->action = action;
        this->left = left;
        this->right.push_back(right1);
        if (right2 != "") this->right.push_back(right2);
        if (right3 != "") this->right.push_back(right3);
        if (right4 != "") this->right.push_back(right4);
        if (right5 != "") this->right.push_back(right5);
        stringstream ss;
        ss <<this->left<<" ->";
        I(i, right.size())
            ss<<" "<< right[i];
        this->key = ss.str();
    }
};

class LineTree
{
    HashSet<string> span_set;
    vector<Span *> all_spans;
    HashTable<string, vector<Rule *> > rule_end_at;
    HashTable<int, vector<Span *> > spans_end_at;
    LinkedList<Span *> to_reduce;
    bool _inited;
public:
    HashTable<string, Rule> rules;

    LineTree() {
        _reset();
        _inited = false;
    }

    ~LineTree() {
        _reset();
    }

    void add_rule(const Rule & rule) {
        if (_inited) throw CompilationError("LineTree is inited and fixed");
        rules.put(rule.key, rule);
    }

private:
    void _init() {
        if (_inited) return;
        _inited = true;
        vector<string> rule_keys = rules.keys();
        I(i, rule_keys.size()) {
            string key = rule_keys[i];
            Rule & rule = rules.get(key);
            string type = rule.right[rule.right.size() - 1];
            if (! rule_end_at.hasKey(type))
                rule_end_at.put(type, vector<Rule *>());
            vector<Rule *> & rule_end_at1 = rule_end_at.get(type);
            rule_end_at1.push_back(&rule);
        }
    }

    void _reset() {
        I(i, all_spans.size())
            delete all_spans[i];
        all_spans.clear();
        span_set.clear();
        spans_end_at.clear();
        to_reduce.clear();
    }

    void _try_match_rule(const Rule & rule, Span & span, LinkedList<Span *> matched_list) {
        matched_list.addFirst(&span);
        if (matched_list.size() == rule.right.size()) {
            Span & end = *matched_list.getLast();
            Span * matched = new Span(span.line_no,
                span.prev_end,span.char_start,end.char_end,rule.left,rule.key);
            matched->children = matched_list;
            all_spans.push_back(matched);
            if (span_set.has(matched->key))
                _report_ambiguity_error(*matched);
            span_set.add(matched->key);
            to_reduce.addLast(matched);
            spans_end_at.get(matched->char_end).push_back(matched);
        }
        else {
            string prev_type = rule.right[rule.right.size()-1-matched_list.size()];
            if (spans_end_at.hasKey(span.prev_end)) {
                vector<Span *> & span_end_at1 = spans_end_at.get(span.prev_end);
                I(i, span_end_at1.size())
                    if (span_end_at1[i]->type == prev_type)
                        _try_match_rule(rule, *span_end_at1[i], matched_list);
            }
        }
    }

    void _add_span(Span & span) {
        to_reduce.addLast(&span);
        vector<Span *> span_end_at;
        span_end_at.push_back(&span);
        spans_end_at.put(span.char_end, span_end_at);
        while (to_reduce.size() > 0) {
            Span * to_reduce1 = to_reduce.removeFirst();
            if (! rule_end_at.hasKey(to_reduce1->type)) continue;
            vector<Rule *> & rules = rule_end_at.get(to_reduce1->type);
            LinkedList<Span *> matched_list;
            I(i, rules.size())
                _try_match_rule(*rules[i], *to_reduce1, matched_list);
        }
    }

    Span _get_result(int end, vector<Span> & spans) {
        vector<Span *> & span_end_at1 = spans_end_at.get(end);
        Span * res = 0;
        I(i, span_end_at1.size())
            if (span_end_at1[i]->prev_end == 0 && span_end_at1[i]->type[0] == '#') {
                if (res == 0) res = span_end_at1[i];
                else _report_ambiguity_error(*res);
            }
        if (res == 0)
            _report_syntax_error(spans);
        return *res;
    }

    void _report_ambiguity_error(Span & matched) {
        stringstream ss;
        ss<<"Ambiguous span "<<matched.key<< " line:"<<(matched.line_no+1);
        throw CompilationError(ss.str().c_str());
    }

    void _report_syntax_error(vector<Span> & spans) {
        stringstream ss;
        ss<<"Syntax error at line:"<<(spans[0].line_no+1);
        throw CompilationError(ss.str().c_str());
    }

public:
    Span reduce(vector<Span> & spans) {
        _init();
        _reset();
        I(i, spans.size()) {
            _add_span(spans[i]);
        }
        return _get_result(spans[spans.size()-1].char_end, spans);
    }
};

// line

//+ $int
int match_lex_int(const string & line, int start) {
    for (int i = start; i < line.size(); ++ i) {
        if (line[i] >= '0' && line[i] <= '9') continue;
        return i - start;
    }
    return line.size() - start;
}

//+ $float
int match_lex_float(const string & line, int start) {
    int num_dot = 0;
    for (int i = start; i < line.size(); ++ i) {
        if (line[i] == '.') {
            num_dot += 1;
            continue;
        }
        if (line[i] >= '0' && line[i] <= '9') continue;
        return i - start;
    }
    if (num_dot != 1) return 0;
    return line.size() - start;
}

const char * op[] = {"and", "not", "or", "xor", "+", "-", "*", "/", "//", 
            "~", "%", "^", "&", "|", ">", "<", ">=", "<=", "==", "!="};

bool op_has(const string & text) {
    I(i, sizeof(op)/sizeof(char *))
        if (text == op[i]) return true;
    return false;
}
//+ $op
int match_lex_op(const string & line, int start) {
    if (op_has(line.substr(start, 3))) return 3;
    if (op_has(line.substr(start, 2))) return 2;
    if (op_has(line.substr(start, 1))) return 1;
    return 0;
}

//+ $id
int match_lex_id(const string & line, int start) {
    for (int i = start; i < line.size(); ++ i) {
        if (i != start && line[i] >= '0' && line[i] <= '9') continue;
        if (line[i] == '_') continue;
        if (line[i] >= 'a' && line[i] <= 'z') continue;
        if (line[i] >= 'A' && line[i] <= 'Z') continue;
        return i - start;
    }
    return line.size() - start;
}

//+ $text
int match_lex_text(const string & line, int start) {
    char delimitor = ' ';
    for (int i = start; i < line.size(); ++ i) {
        if (i == start) {
            if (line[i] == '"' || line[i] == '\'') {
                delimitor = line[i];
                continue;
            }
        }
        else {
            if (line[i] == delimitor) {
                return i + 1 - start;
            }
            if (line[i] == '\\') {
                if (i + 1 == line.size()) return 0;
                if (line[i + 1] == '\\' || line[i + 1] == delimitor || 
                    line[i + 1] == 'n' || line[i + 1] == 'r' || line[i + 1] == 't') {
                    ++ i;
                    continue;
                }
                else return 0;
            }
            continue;
        }
        return 0;
    }
    return 0;
}

//+ $comment
int match_lex_comment(const string & line, int start) {
    if (line[start] != '#') return 0;
    return line.size() - start;
}

LineLex line_lex;
LineTree line_tree;

TextMatcher t_m_0("'let'", "let");
TextMatcher t_m_1("'='", "=");
TextMatcher t_m_2("'def'", "def");
TextMatcher t_m_3("'('", "(");
TextMatcher t_m_4("')'", ")");
TextMatcher t_m_5("'if'", "if");
TextMatcher t_m_6("'else'", "else");
TextMatcher t_m_7("'for'", "for");
TextMatcher t_m_8("'in'", "in");
TextMatcher t_m_9("'while'", "while");
TextMatcher t_m_10("'return'", "return");
TextMatcher t_m_11("','", ",");

SymbolMatcher s_m_0("$int", match_lex_int);
SymbolMatcher s_m_1("$float", match_lex_float);
SymbolMatcher s_m_2("$op", match_lex_op);
SymbolMatcher s_m_3("$id", match_lex_id);
SymbolMatcher s_m_4("$text", match_lex_text);
SymbolMatcher s_m_5("$comment", match_lex_comment);

void init_rules() {
    line_lex.matchers.push_back(&t_m_0);
    line_lex.matchers.push_back(&t_m_1);
    line_lex.matchers.push_back(&t_m_2);
    line_lex.matchers.push_back(&t_m_3);
    line_lex.matchers.push_back(&t_m_4);
    line_lex.matchers.push_back(&t_m_5);
    line_lex.matchers.push_back(&t_m_6);
    line_lex.matchers.push_back(&t_m_7);
    line_lex.matchers.push_back(&t_m_8);
    line_lex.matchers.push_back(&t_m_9);
    line_lex.matchers.push_back(&t_m_10);
    line_lex.matchers.push_back(&t_m_11);

    line_lex.matchers.push_back(&s_m_0);
    line_lex.matchers.push_back(&s_m_1);
    line_lex.matchers.push_back(&s_m_2);
    line_lex.matchers.push_back(&s_m_3);
    line_lex.matchers.push_back(&s_m_4);
    line_lex.matchers.push_back(&s_m_5);

    line_tree.add_rule(Rule(NULL, "#let", "'let'", "$id", "'='", "exp"));
    line_tree.add_rule(Rule(NULL, "#def", "'def'", "$id", "'('", "')'"));
    line_tree.add_rule(Rule(NULL, "#def", "'def'", "$id", "'('", "id_list", "')'"));
    line_tree.add_rule(Rule(NULL, "#if", "'if'", "exp"));
    line_tree.add_rule(Rule(NULL, "#call", "fun_call"));
    line_tree.add_rule(Rule(NULL, "#else", "'else'"));
    line_tree.add_rule(Rule(NULL, "#for", "'for'", "$id", "'in'", "exp"));
    line_tree.add_rule(Rule(NULL, "#while", "'while'", "exp"));
    line_tree.add_rule(Rule(NULL, "#assign", "$id", "'='", "exp"));
    line_tree.add_rule(Rule(NULL, "#return", "'return'", "exp"));
    line_tree.add_rule(Rule(NULL, "#comment", "$comment"));
    line_tree.add_rule(Rule(NULL, "s_exp", "$int"));
    line_tree.add_rule(Rule(NULL, "s_exp", "$float"));
    line_tree.add_rule(Rule(NULL, "s_exp", "$id"));
    line_tree.add_rule(Rule(NULL, "exp", "s_exp", "$op", "s_exp"));
    line_tree.add_rule(Rule(NULL, "exp", "$op", "s_exp"));
    line_tree.add_rule(Rule(NULL, "s_exp", "'('", "exp", "')'"));
    line_tree.add_rule(Rule(NULL, "s_exp", "fun_call"));
    line_tree.add_rule(Rule(NULL, "s_exp", "$text"));
    line_tree.add_rule(Rule(NULL, "exp", "s_exp"));
    line_tree.add_rule(Rule(NULL, "id_list", "$id"));
    line_tree.add_rule(Rule(NULL, "id_list", "id_list", "','", "$id"));
    line_tree.add_rule(Rule(NULL, "exp_list", "exp"));
    line_tree.add_rule(Rule(NULL, "exp_list", "exp_list", "','", "exp"));
    line_tree.add_rule(Rule(NULL, "fun_call", "$id", "'('", "')'"));
    line_tree.add_rule(Rule(NULL, "fun_call", "$id", "'('", "exp_list", "')'"));

}


void parse(const char filename[]) {
    /*
    ifstream infile(filename);
    if (infile.fail()) {
        cout << "Cannot open file " << filename << endl;
        return;
    }
    */
    string line;
    vector<vector<Span> > spans;
    int i = 0;
    //while (getline(infile, line)) {
    while (getline(cin, line)) {
        spans.push_back(vector<Span>());
        line_lex.match(line, i, spans[i]);
        ++ i;
    }
    //infile.close();

    vector<Span> line_spans;
    I(i, spans.size()) {
        if (spans[i].size() > 0) {
            Span line = line_tree.reduce(spans[i]);
            line_spans.push_back(line);
            cout << line.linearize() << endl;
        }
    }
}

int main(int argc, char * argv[]) {
    init_rules();
    /*
    if (argc != 2) {
        cout << "No input file" << endl;
        return 1;
    }
    parse(argv[1]);
    */
    parse(NULL);
}