Boost.Spirit SQL-Grammatik- / Lexer-Fehler

Ich habe zwei Probleme mit der folgenden SQL-Grammatik:

#define BOOST_SPIRIT_QI_DEBUG

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/karma.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp> 

#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <boost/lexical_cast.hpp>

#include <cstddef>
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <utility>

namespace bs = boost::spirit;
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;

// Token definitions for the SQL lexer: column data-type keywords, statement
// keywords, punctuation, literals and identifiers. A separate "WS" lexer
// state holds whitespace and comment patterns that the parser uses for
// skipping (see qi::in_state("WS") in main).
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
public:
    // Keyword tokens carry no attribute (lex::omit) — only their token id matters.
    lex::token_def<lex::omit> type_smallint, type_int, type_varchar, type_text, type_date;
    lex::token_def<lex::omit> kw_not_null, kw_auto_increment, kw_unique, kw_default, kw_create,
        kw_table, kw_constraint, kw_primary_key;

    // Attributed tokens. (If you add a new attribute type, it must also be
    // listed in the boost::mpl::vector of the lex::lexertl::token typedef in main.)
    lex::token_def<int> signed_digit;
    lex::token_def<std::size_t> unsigned_digit;
    lex::token_def<std::string> identifier;
    lex::token_def<std::string> quoted_string;

    sql_tokens()
    {
        // Column data types, matched case-insensitively via (?i:...).
        type_smallint = "(?i:smallint)";
        type_int = "(?i:int)";
        type_varchar = "(?i:varchar)";
        type_text = "(?i:text)";
        type_date = "(?i:date)";

        // Keywords. Two-word keywords allow one or more spaces between the words.
        kw_not_null = "(?i:not +null)";
        kw_auto_increment = "(?i:auto_increment)";
        kw_unique = "(?i:unique)";
        kw_default = "(?i:default)";
        kw_create = "(?i:create)";
        kw_table = "(?i:table)";
        kw_constraint = "(?i:constraint)";
        kw_primary_key = "(?i:primary +key)";

        // Literal values.
        signed_digit = "[+-]?[0-9]+";
        unsigned_digit = "[0-9]+";
        quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // double-quoted string with backslash escapes: \"(\\.|[^\"])*\"

        // Identifier: letter followed by letters, digits or underscores.
        identifier = "[a-zA-Z][a-zA-Z0-9_]*";

        // The tokens must be added in priority order: punctuation, then type
        // keywords, then statement keywords, then identifiers/literals.
        // NOTE(review): signed_digit is added after unsigned_digit, so it can
        // only match when an explicit +/- sign is present — TODO confirm intended.
        this->self += lex::token_def<>('(') | ')' | ',' | ';';
        this->self += type_smallint | type_int | type_varchar | type_text |
                                    type_date;
        this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default |
                                    kw_create | kw_table | kw_constraint | kw_primary_key;
        this->self += identifier | unsigned_digit | signed_digit | quoted_string;

        // Whitespace and comments to ignore, defined only in the "WS" state.
        // NOTE(review): these patterns are not available in the INITIAL state;
        // a token demanded while the lexer is still in INITIAL (e.g. input that
        // begins with a comment, as in problem 1 below) cannot match any of
        // them — presumably related to the reported failure; verify.
        this->self("WS")
                =       lex::token_def<>("[ \\t\\n]+") 
                |       "--[^\\n]*\\n"  // single-line comment: "--" to end of line
                |       "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" // C-style block comment /* ... */
                ;
    }
};

// Grammar for a small subset of SQL: one or more CREATE TABLE statements
// with column definitions, per-column constraints and table-level
// CONSTRAINT ... PRIMARY KEY clauses. It parses the token stream produced
// by sql_tokens and skips whitespace/comments via the lexer's "WS" state.
template <typename Iterator, typename Lexer>
struct sql_grammar 
    : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
    template <typename TokenDef>
    sql_grammar(TokenDef const& tok)
        : sql_grammar::base_type(program, "program")
    {
        // Semicolon-separated statements; extra trailing semicolons allowed.
        program 
            =  (statement % ';') >> *qi::lit(';')
            ;

        // Only CREATE statements are supported for now.
        statement 
            =   create_statement.alias()
            ;

        create_statement
            =   tok.kw_create >> create_table
            ;

        // TABLE name ( columns [, table-constraints] )
        create_table
            =   tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')'
            ;

        table_constraints
            =   constraint_definition % ','
            ;

        // CONSTRAINT name PRIMARY KEY (col, ...)
        constraint_definition
            = tok.kw_constraint >> tok.identifier >> primary_key_constraint
            ;

        primary_key_constraint
            = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')'
            ;

        create_table_columns
            =   column_definition % ','
            ;

        // name type [NOT NULL] [AUTO_INCREMENT] [UNIQUE] [DEFAULT "..."]*
        column_definition
            =   tok.identifier >> column_type >> *type_constraint
            ;

        type_constraint
            =   tok.kw_not_null
            |   tok.kw_auto_increment
            |   tok.kw_unique
            |   default_value
            ;

        // '>' is an expectation: once DEFAULT is seen, a quoted string must
        // follow, otherwise the on_error handler below fires.
        default_value
            =   tok.kw_default > tok.quoted_string
            ;

        // VARCHAR requires a parenthesized size (expectation points again).
        column_type
            =   tok.type_smallint
            |   tok.type_int
            |   (tok.type_varchar > '(' > tok.unsigned_digit > ')') 
            |   tok.type_text
            |   tok.type_date
            ;

        // Rule names used in debug output and error messages.
        program.name("program");
        statement.name("statement");
        create_statement.name("create statement");
        create_table.name("create table");
        create_table_columns.name("create table columns");
        column_definition.name("column definition");
        column_type.name("column type");
        default_value.name("default value");
        type_constraint.name("type constraint");
        table_constraints.name("table constraints");
        constraint_definition.name("constraint definition");
        primary_key_constraint.name("primary key constraint");

        BOOST_SPIRIT_DEBUG_NODE(program);
        BOOST_SPIRIT_DEBUG_NODE(statement);
        BOOST_SPIRIT_DEBUG_NODE(create_statement);
        BOOST_SPIRIT_DEBUG_NODE(create_table);
        BOOST_SPIRIT_DEBUG_NODE(create_table_columns);
        BOOST_SPIRIT_DEBUG_NODE(column_definition);
        BOOST_SPIRIT_DEBUG_NODE(column_type);
        BOOST_SPIRIT_DEBUG_NODE(default_value);
        BOOST_SPIRIT_DEBUG_NODE(type_constraint);
        BOOST_SPIRIT_DEBUG_NODE(table_constraints);
        BOOST_SPIRIT_DEBUG_NODE(constraint_definition);
        BOOST_SPIRIT_DEBUG_NODE(primary_key_constraint);

        // Report expectation failures: _4 = name of what was expected,
        // _3 = iterator at the error position, _2 = end of input.
        using namespace qi::labels;
        qi::on_error<qi::fail>
        (
            program,
            std::cout
                << phx::val("Error! Expecting ")
                << bs::_4                               // what failed?
                << phx::val(" here: \"")
                << phx::construct<std::string>(bs::_3, bs::_2)   // iterators to error-pos, end
                << phx::val("\"")
                << std::endl
        );
    }

private:
    typedef qi::in_state_skipper<Lexer> skipper_type;
    typedef qi::rule<Iterator, skipper_type> simple_rule;

    simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition;
    simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type;
};

// Read the entire contents of the named file into a string.
// Opens in binary mode so no newline translation occurs; an unreadable or
// missing file yields an empty string.
std::string file2string(const std::string& filename)
{
    std::ifstream input(filename.c_str(), std::ios_base::binary);
    std::ostringstream contents;
    contents << input.rdbuf();
    return contents.str();
}

int main(int argc, char* argv[])
{
    if(argc != 2)
    {
        std::cerr << "usage: " << argv[0] << " schema_filename\n";
        return 1;
    }

    // iterator type used to expose the underlying input stream
    typedef std::string::iterator base_iterator_type;

    // This is the lexer token type to use.
    typedef lex::lexertl::token<
        base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> 
    > token_type;

    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef sql_tokens<lexer_type> sql_tokens;

    // this is the iterator type exposed by the lexer 
    typedef sql_tokens::iterator_type iterator_type;

    // this is the type of the grammar to parse
    typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    sql_tokens tokens;                         // Our lexer
    sql_grammar sql(tokens);                  // Our parser

    std::string str(file2string(argv[1]));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    base_iterator_type it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();

    // Parsing is done based on the the token stream, not the character 
    // stream read from the input.
    // Note how we use the lexer defined above as the skip parser. It must
    // be explicitly wrapped inside a state directive, switching the lexer 
    // state for the duration of skipping whitespace.
    std::string ws("WS");
    bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }
    return 0;
}
Problem 1: Datei beginnt mit einem Kommentar

Wenn die Datei mit einem Kommentar beginnt, schlägt das Parsen sofort fehl:

/* bouh */

CREATE TABLE mytable (
  id int NOT NULL AUTO_INCREMENT
);

Mit diesem fehlerhaften Baum:

<program>
  <try>[/]</try>
  <statement>
    <try>[/]</try>
    <create_statement>
      <try>[/]</try>
      <fail/>
    </create_statement>
    <fail/>
  </statement>
  <fail/>
</program>

Aber wenn ich direkt davor einen Zeilenumbruch hinzufüge, funktioniert es. Beide Kommentartypen ("--" und "/* */") schlagen fehl.

Problem 2: Das Schlüsselwort UNIQUE wird nicht erkannt

Das Parsen schlägt unter sehr spezifischen Bedingungen beim Schlüsselwort UNIQUE fehl. Es funktioniert nicht, wenn UNIQUE in Großbuchstaben geschrieben ist und direkt ein Komma folgt.

Alle folgenden Fälle sind erfolgreich:

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL unique,
  s int NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL UNIQUE ,
  s int NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint UNIQUE NOT NULL,
  s int NOT NULL UNIQUE
);

Aber der folgende Fall schlägt fehl:

-- Fail
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL UNIQUE,
  s int NOT NULL
);

Haben Sie eine Vorstellung davon, was falsch ist? Vielen Dank!

Antworten auf die Frage(1)

Ihre Antwort auf die Frage