#ifndef WONKY_LEXER_C
#define WONKY_LEXER_C WONKY_LEXER_C
#include <lexer.h>
/* Lexes the whole source file into a preprocessing translation unit,
 * registers the unit in the program's translation-unit map (keyed by the
 * source file name), and returns it.
 * Returns NULL -- after recording a generic error -- when the unit turned
 * out to contain no tokens at all; note the unit is still in the map. */
struct Preprocessing_Translation_Unit* lex(struct Source_File *src,struct Program *program)
{
	struct Lexer_Data *data;
	struct Preprocessing_Translation_Unit *unit;

	data=get_lexer_data(src,program);
	unit=lex_inner(data);

	Map_Push(
			program->preprocessing_translation_units,
			src->src_name->name,
			src->src_name->name_size,
			unit
		);

	delete_lexer_data(data);

	if(unit->tokens->size==0)
	{
		push_generic_error(program,"Empty translation unit");
		return NULL;
	}

	return unit;
}
/* Drains the lexer: extracts tokens until end of input, collecting every
 * non-NULL token into a fresh preprocessing translation unit. */
struct Preprocessing_Translation_Unit* lex_inner(struct Lexer_Data *lexer_data)
{
	struct Preprocessing_Translation_Unit *ret;
	struct token *hold;

	ret=get_preprocessing_translation_unit(lexer_data->src);

	while(!lexer_eof(lexer_data))
	{
		hold=lexer_extract_next_token(lexer_data);
		if(hold)
			push_token_into_preprocessing_translation_unit(ret,hold);
	}

	return ret;
}
/* Allocates and initialises the mutable lexer state for one source file:
 * the read cursor starts at offset 0, row/column 0, at the beginning of a
 * line, outside any #if body, with the normal automata view and an empty
 * pushback buffer. Caller releases with delete_lexer_data(). */
struct Lexer_Data* get_lexer_data(struct Source_File *src,struct Program *program)
{
	struct Lexer_Data *lexer_data;

	lexer_data=wonky_malloc(sizeof(struct Lexer_Data));

	lexer_data->src=src;
	lexer_data->program=program;

	lexer_data->where_in_src=0;
	lexer_data->which_row=0;
	lexer_data->which_column=0;

	lexer_data->is_in_the_begining_of_line=1;
	lexer_data->is_in_if_directive_body=0;
	lexer_data->automata_view=AUTOMATA_VIEW_NORMAL;

	lexer_data->previous_token_location=&start_of_file;
	lexer_data->buffer_token=NULL;

	return lexer_data;
}
/* Lexes a single token out of an in-memory string.
 * BUG FIX: the original body initialised the automata pointers and then fell
 * off the end of a non-void function -- undefined behaviour as soon as a
 * caller uses the result. The automata walk itself is not implemented yet,
 * so the stub now returns NULL explicitly, making "not implemented" safe.
 * TODO: feed the characters of str into the chonky automata the way
 * lexer_feed_automata_until_error does for files, and build the token. */
struct token* lex_token_from_string(char *str,size_t string_length)
{
	struct Automata_Node *head;
	struct Automata_Node *follower;

	wonky_assert(str && string_length);

	head=&chonky[0];
	follower=NULL;

	(void)head;
	(void)follower;

	return NULL;
}
/* Advances the lexer past whitespace (space, vertical tab, horizontal tab
 * and CHAR_FORM_FEED_TAB, which this file treats as the line terminator).
 * Positions are read into locals and only committed back into lexer_data
 * inside the loop body, so the first non-whitespace character is NOT
 * consumed: its read is left uncommitted for the caller. */
void lexer_skip_white_space(struct Lexer_Data *lexer_data)
{
size_t where_in_src=lexer_data->where_in_src;
size_t which_line=lexer_data->which_row;
size_t which_column=lexer_data->which_column;
enum Source_Chars ch;
/* ch starts as a CHAR_SPACE sentinel so the body runs once before any
 * character is read; that first iteration commits the unchanged position. */
for(ch=CHAR_SPACE;
(ch==CHAR_SPACE || ch==CHAR_VERTICAL_TAB || ch==CHAR_HORISONTAL_TAB || ch==CHAR_FORM_FEED_TAB) && !lexer_eof(lexer_data);
ch=lexer_get_ch_accounting_for_linesplices_and_comments(
lexer_data->program,
lexer_data->src->src,
lexer_data->src->src_size,
&where_in_src,
&which_line,
&which_column)
)
{
/* a line terminator re-arms '#'-directive recognition */
if(ch==CHAR_FORM_FEED_TAB)
lexer_data->is_in_the_begining_of_line=1;
/* commit the position reached by the previous read */
lexer_data->where_in_src=where_in_src;
lexer_data->which_row=which_line;
lexer_data->which_column=which_column;
}
}
/* True once the read cursor has consumed the whole source buffer.
 * NOTE(review): a plain 'inline' definition at file scope does not emit an
 * external definition in C99/C11; confirm a matching 'extern' declaration
 * exists in a header, otherwise this can fail to link when the compiler
 * chooses not to inline. */
inline _Bool lexer_eof(struct Lexer_Data *lexer_data)
{
#warning might want a lexer_skip_white_space(lexer_data) here =)
return lexer_data->where_in_src>=lexer_data->src->src_size;
}
/* Returns the next token from the source, or NULL at end of input.
 * A token buffered by lexer_check() takes precedence over reading new input.
 * On an unrecognised lexical element an error token is produced instead.
 * Side effects: advances the read cursor, clears the beginning-of-line flag
 * and records the token's location as previous_token_location. */
struct token* lexer_extract_next_token(struct Lexer_Data *lexer_data)
{
	struct token *ret;
	struct Automata_Node *hold_node;
	size_t where_does_the_token_start_in_the_source_file;

	/* hand out the pushback token first, if lexer_check left one */
	if(lexer_data->buffer_token!=NULL)
	{
		ret=lexer_data->buffer_token;
		lexer_data->buffer_token=NULL;
		return ret;
	}

	lexer_skip_white_space(lexer_data);
	where_does_the_token_start_in_the_source_file=lexer_data->where_in_src;

	if(lexer_eof(lexer_data))
		return NULL;

	hold_node=lexer_feed_automata_until_error(lexer_data);

	if(hold_node==NULL)
		/* BUG FIX: which_row/which_column were passed in swapped order
		 * here, unlike the get_source_location call in
		 * lexer_make_token_finishing_on_node -- made them consistent */
		return get_error_token("Unrecognised lexical element",get_source_location(
					lexer_data->which_row,
					lexer_data->which_column,
					where_does_the_token_start_in_the_source_file,
					lexer_data->where_in_src-where_does_the_token_start_in_the_source_file,
					lexer_data->src
					),
				lexer_data->previous_token_location,
				lexer_data->program);

	ret=lexer_make_token_finishing_on_node(lexer_data, hold_node, where_does_the_token_start_in_the_source_file);
	lexer_data->is_in_the_begining_of_line=0;
	lexer_data->previous_token_location=ret->delta->location;
	return ret;
}
/* Feeds source characters into the 'chonky' automata until it rejects one,
 * returning the last node that accepted.
 * NOTE(review): follower is assigned before the first feed, so this returns
 * &chonky[0] -- never NULL -- even when the very first character is
 * rejected; the NULL check in lexer_extract_next_token therefore looks
 * unreachable. Confirm whether chonky[0] is meant to act as the
 * error/sentinel node (its keyword would fall into the default error case
 * of lexer_make_token_finishing_on_node). */
struct Automata_Node* lexer_feed_automata_until_error(struct Lexer_Data *lexer_data)
{
struct Automata_Node *head;
struct Automata_Node *follower;
head=&chonky[0];
follower=NULL;
/* head becomes NULL when the automata rejects the next character */
while(head!=NULL)
{
follower=head;
head=lexer_feed_automata_next_char(lexer_data,head);
}
return follower;
}
/* Reads one logical character and takes the corresponding automata
 * transition out of node. Returns the next node, materialising a fresh
 * identifier node when the transition leads to the shared id_node sentinel.
 * On rejection (no transition) the read cursor is rolled back to where it
 * was before the character was consumed and NULL is returned. */
struct Automata_Node *lexer_feed_automata_next_char(struct Lexer_Data *lexer_data,struct Automata_Node *node)
{
	size_t saved_where;
	size_t saved_column;
	size_t saved_row;
	struct Automata_Node *next;
	enum Source_Chars ch;

	/* remember the cursor so a rejected character can be un-read */
	saved_where=lexer_data->where_in_src;
	saved_column=lexer_data->which_column;
	saved_row=lexer_data->which_row;

	ch=lexer_get_ch_accounting_for_linesplices_and_comments(
			lexer_data->program,
			lexer_data->src->src,
			lexer_data->src->src_size,
			&lexer_data->where_in_src,
			&lexer_data->which_row,
			&lexer_data->which_column);

	next=(node->delta[ch]==id_node)?get_new_id_node(node,ch):node->delta[ch];

	if(next!=NULL)
		return next;

	/* rejected: restore the cursor and report the dead end */
	lexer_data->where_in_src=saved_where;
	lexer_data->which_column=saved_column;
	lexer_data->which_row=saved_row;
	return NULL;
}
/* Lazily interns the identifier spelled between start_position and the
 * current read position, caching it on the automata node so later hits on
 * the same node reuse the cached identifier instead of re-interning. */
static void lexer_intern_identifier_into_node(struct Lexer_Data *lexer_data,struct Automata_Node *finishing_node,size_t start_position)
{
	if(finishing_node->data==NULL)
		finishing_node->data=get_identifier(lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);
}
/* Turns the automata node the lexer stopped on into a concrete token.
 * start_position is the source offset where the token's spelling begins;
 * the spelling ends at the current read cursor.
 * '#'/'##' are context-sensitive: inside a preprocessing directive they are
 * ordinary small tokens, otherwise '#' must open a logical line (and then
 * starts directive lexing) and '##' is an error.
 * Refactor: the identifier-interning block that was duplicated for KW_ID,
 * the keyword cases, the PKW_* directive names and PKW_DEFINED now lives in
 * lexer_intern_identifier_into_node above. */
struct token *lexer_make_token_finishing_on_node(struct Lexer_Data *lexer_data,struct Automata_Node *finishing_node,size_t start_position)
{
	struct Source_Location *token_location;

	wonky_assert(lexer_data->where_in_src > start_position);
	wonky_assert(is_valid_automata_node(finishing_node));

	token_location=get_source_location(
			lexer_data->which_row,
			lexer_data->which_column,
			start_position,
			lexer_data->where_in_src-start_position,
			lexer_data->src
			);

	switch(finishing_node->keyword)
	{
		case KW_HASHTAG_HASHTAG:
			/* '##' (token paste) only makes sense inside a directive */
			if(lexer_data->automata_view==AUTOMATA_VIEW_PREPROCESSING_DIRECTIVE)
				return get_small_token(KW_HASHTAG_HASHTAG,token_location,lexer_data->previous_token_location);
			else
				return get_error_token("Ran into ## while not in a preprocessing directive. Invalid syntax.",token_location,lexer_data->previous_token_location,lexer_data->program);
		case KW_HASHTAG:
			if(lexer_data->automata_view==AUTOMATA_VIEW_PREPROCESSING_DIRECTIVE)
			{
				return get_small_token(KW_HASHTAG,token_location,lexer_data->previous_token_location);
			}else if(!lexer_data->is_in_the_begining_of_line)
			{
				return get_error_token("# is not in the begining of a logical line",token_location,lexer_data->previous_token_location,lexer_data->program);
			}else
			{
				/* '#' opening a logical line starts a directive */
				return lex_preprocessing_directive(lexer_data,token_location);
			}
		case KW_ID:
			lexer_intern_identifier_into_node(lexer_data,finishing_node,start_position);
			return get_id_token(finishing_node->data,token_location,lexer_data->previous_token_location);
		case KW_AUTO:
		case KW_DO:
		case KW_DOUBLE:
		case KW_INT:
		case KW_STRUCT:
		case KW_BREAK:
		case KW_ELSE:
		case KW_LONG:
		case KW_SWITCH:
		case KW_CASE:
		case KW_ENUM:
		case KW_REGISTER:
		case KW_TYPEDEF:
		case KW_CHAR:
		case KW_EXTERN:
		case KW_RETURN:
		case KW_UNION:
		case KW_CONST:
		case KW_FLOAT:
		case KW_SHORT:
		case KW_UNSIGNED:
		case KW_CONTINUE:
		case KW_FOR:
		case KW_SIGNED:
		case KW_VOID:
		case KW_DEFAULT:
		case KW_GOTO:
		case KW_SIZEOF:
		case KW_VOLATILE:
		case KW_IF:
		case KW_STATIC:
		case KW_WHILE:
		case KW_INLINE:
		case KW_RESTRICT:
		case KW_BOOL:
		case KW_COMPLEX:
		case KW_IMAGINARY:
			/* keywords also carry their interned spelling (they can be
			 * ordinary identifiers in preprocessing contexts) */
			lexer_intern_identifier_into_node(lexer_data,finishing_node,start_position);
			return get_keyword_token(finishing_node->keyword,token_location,lexer_data->previous_token_location,finishing_node->data);
		case KW_EXCLAMATION:
		case KW_PERCENT:
		case KW_AND:
		case KW_AND_AND:
		case KW_OPEN_NORMAL:
		case KW_CLOSE_NORMAL:
		case KW_STAR:
		case KW_PLUS:
		case KW_COMMA:
		case KW_MINUS:
		case KW_DOT:
		case KW_ARROW:
		case KW_COLUMN:
		case KW_SEMICOLON:
		case KW_LESS:
		case KW_EQ:
		case KW_EQEQ:
		case KW_MORE:
		case KW_QUESTION:
		case KW_HAT:
		case KW_PIPE:
		case KW_PIPE_PIPE:
		case KW_TILDE:
		case KW_PLUSPLUS:
		case KW_MINUSMINUS:
		case KW_SHIFT_RIGHT:
		case KW_SHIFT_LEFT:
		case KW_LESS_EQ:
		case KW_MORE_EQ:
		case KW_NOT_EQ:
		case KW_PLUS_EQ:
		case KW_MINUS_EQ:
		case KW_STAR_EQ:
		case KW_PERCENT_EQ:
		case KW_SHIFT_LEFT_EQ:
		case KW_SHIFT_RIGHT_EQ:
		case KW_AND_EQ:
		case KW_HAT_EQ:
		case KW_PIPE_EQ:
		case KW_ELIPSIS:
		case KW_DIV:
		case KW_OPEN_SQUARE:
		case KW_CLOSE_SQUARE:
		case KW_CLOSE_CURLY:
		case KW_OPEN_CURLY:
		case KW_DIV_EQ:
		case KW_FORWARD_SLASH:
			return get_punctuator_token(finishing_node->keyword,token_location,lexer_data->previous_token_location);
		case KW_HEXADECIMAL_CONSTANT:
		case KW_DECIMAL_CONSTANT:
		case KW_OCTAL_CONSTANT:
		case KW_UNSIGNED_DECIMAL_CONSTANT:
		case KW_UNSIGNED_OCTAL_CONSTANT:
		case KW_UNSIGNED_HEXADECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_HEXADECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_OCTAL_CONSTANT:
		case KW_UNSIGNED_LONG_DECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_LONG_DECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_LONG_HEXADECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_LONG_OCTAL_CONSTANT:
		case KW_LONG_HEXADECIMAL_CONSTANT:
		case KW_LONG_OCTAL_CONSTANT:
		case KW_LONG_DECIMAL_CONSTANT:
		case KW_LONG_LONG_HEXADECIMAL_CONSTANT:
		case KW_LONG_LONG_OCTAL_CONSTANT:
		case KW_LONG_LONG_DECIMAL_CONSTANT:
		case KW_DOUBLE_DECIMAL_CONSTANT:
		case KW_LONG_DOUBLE_DECIMAL_CONSTANT:
		case KW_FLOAT_DECIMAL_CONSTANT:
		case KW_DOUBLE_HEXADECIMAL_CONSTANT:
		case KW_LONG_DOUBLE_HEXADECIMAL_CONSTANT:
		case KW_FLOAT_HEXADECIMAL_CONSTANT:
		case KW_CHAR_CONSTANT:
		case KW_WIDE_CHAR_CONSTANT:
			/* constants keep their raw spelling for later evaluation */
			return get_constant_token(finishing_node->keyword,token_location,lexer_data->previous_token_location,lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);
		case KW_STRING:
		case KW_WIDE_STRING:
			/* +1/-2 strip the surrounding quote characters */
			return get_string_token(finishing_node->keyword,token_location,lexer_data->previous_token_location,lexer_data->src->src+start_position+1,lexer_data->where_in_src-start_position-2);
		case PKW_FILE_MACRO:
			return get_file_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_DATE_MACRO:
			return get_date_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_LINE_MACRO:
			return get_line_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_STDC_MACRO:
			return get_stdc_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_STDC_HOSTED_MACRO:
			return get_stdc_hosted_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_STDC_VERSION_MACRO:
			return get_stdc_version_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_TIME_MACRO:
			return get_time_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_IF:
		case PKW_IFDEF:
		case PKW_IFNDEF:
		case PKW_ELIF:
		case PKW_ELSE:
		case PKW_ENDIF:
		case PKW_INCLUDE:
		case PKW_DEFINE:
		case PKW_UNDEF:
		case PKW_LINE:
		case PKW_ERROR:
		case PKW_PRAGMA:
			/* directive names are plain identifiers outside directives */
			lexer_intern_identifier_into_node(lexer_data,finishing_node,start_position);
			return get_id_token(finishing_node->data,token_location,lexer_data->previous_token_location);
		case PKW_DEFINED:
			/* 'defined' is an operator only inside a directive */
			if(lexer_data->automata_view==AUTOMATA_VIEW_PREPROCESSING_DIRECTIVE)
			{
				return lex_defined_unary_operator(lexer_data,token_location);
			}else
			{
				lexer_intern_identifier_into_node(lexer_data,finishing_node,start_position);
				return get_id_token(finishing_node->data,token_location,lexer_data->previous_token_location);
			}
		default:
			return get_error_token("Unexpected token",token_location,lexer_data->previous_token_location,lexer_data->program);
	}
	wonky_assert(SHOULD_NOT_REACH_HERE);
}
/* Thin wrapper: delegates lexing of the 'defined' unary operator (and its
 * operand) to the preprocessor's lexer. 'where' is the location of the
 * 'defined' spelling itself. */
struct token *lex_defined_unary_operator(struct Lexer_Data *lexer_data,struct Source_Location *where)
{
return preprocessing_lex_defined_unary_operator(lexer_data,where);
}
/* Called after a '#' that opens a logical line: hands the rest of the
 * directive to the preprocessor lexer, or yields an EOF token when the
 * '#' was the last thing in the file. */
struct token *lex_preprocessing_directive(struct Lexer_Data *lexer_data,struct Source_Location *where)
{
	return lexer_eof(lexer_data)?get_eof_token():preprocessing_lex_directive(lexer_data,where);
}
_Bool lexer_get_and_check(struct Lexer_Data *lexer_data,enum LEXER_TYPE token_type)
{
if(lexer_check(lexer_data,token_type))
{
delete_token(lexer_data->buffer_token);
return 1;
}else
{
return 0;
}
}
/* Peeks at the next token without consuming it: fills the one-token
 * pushback buffer on demand and reports whether the buffered token has the
 * requested type. The token stays buffered either way. */
_Bool lexer_check(struct Lexer_Data *lexer_data,enum LEXER_TYPE token_type)
{
	if(lexer_data->buffer_token==NULL)
		lexer_data->buffer_token=lexer_extract_next_token(lexer_data);

	return lexer_data->buffer_token!=NULL && lexer_data->buffer_token->type==token_type;
}
/* Reads the next character from src at *where_in_src, transparently
 * consuming line splices (backslash immediately followed by the line
 * terminator CHAR_FORM_FEED_TAB). Advances *where_in_src, *which_line and
 * *which_column accordingly. At end of file returns CHAR_SPACE; a splice
 * that runs into end of file additionally records a generic error.
 * NOTE(review): a backslash NOT followed by a newline is consumed during
 * the splice probe and never returned -- '\x' comes out as just 'x'.
 * Confirm whether escape sequences are meant to be handled before this
 * layer ever sees a backslash. */
enum Source_Chars lexer_get_ch_accounting_for_linesplices(struct Program *program,const char *src,size_t src_size,size_t *where_in_src,size_t *which_line,size_t *which_column)
{
enum {
UNKNOWN_CHAR,
START_OF_POSSIBLE_LINE_SPLICE,
KNOWN_CHAR
} state = UNKNOWN_CHAR;
enum Source_Chars ch;
/* loop until we are looking at a character that is not part of a splice */
while(state!=KNOWN_CHAR && *where_in_src<src_size)
{
ch=get_ch(src+*where_in_src,src_size-*where_in_src);
switch(state)
{
case UNKNOWN_CHAR:
if(ch==CHAR_BACKWARD_SLASH)
{
/* possibly the start of a splice; consume the backslash */
state=START_OF_POSSIBLE_LINE_SPLICE;
++*where_in_src;
++*which_column;
}else
{
state=KNOWN_CHAR;
}
break;
case START_OF_POSSIBLE_LINE_SPLICE:
if(ch==CHAR_FORM_FEED_TAB)
{
/* backslash-newline: splice the lines and keep scanning */
state=UNKNOWN_CHAR;
++*where_in_src;
*which_column=0;
++*which_line;
}else
{
state=KNOWN_CHAR;
}
break;
default:
wonky_assert(SHOULD_NOT_REACH_HERE);
}
}
/* KNOWN_CHAR means ch was peeked but not yet consumed; consume it now */
if(*where_in_src<src_size && state==KNOWN_CHAR)
{
++*where_in_src;
if(ch==CHAR_FORM_FEED_TAB)
{
++*which_line;
*which_column=0;
}else
{
++*which_column;
}
return ch;
}else if(state==START_OF_POSSIBLE_LINE_SPLICE)
{
push_generic_error(program,"Can't linesplice into an end of file");
return CHAR_SPACE;
}else
{
/* plain end of file */
return CHAR_SPACE;
}
}
/* Reads the next logical character, additionally treating comments as
 * whitespace (C translation phase 3): a '//' comment collapses to the line
 * terminator, a block comment collapses to a single space. Comment
 * detection peeks through local position copies so a lone '/' does not
 * move the real cursor.
 * BUG FIX: the block-comment scanner reset to START on a second '*', so a
 * comment ending in "**" + "/" (e.g. "/ * ... ** /") never terminated; a
 * '*' now keeps the scanner in the STAR state.
 * Improvement: an unterminated block comment at end of file now records an
 * error instead of being silently swallowed. */
enum Source_Chars lexer_get_ch_accounting_for_linesplices_and_comments(struct Program *program,const char *src,size_t src_size,size_t *where_in_src,size_t *which_line,size_t *which_column)
{
	enum Source_Chars ch;
	ch=lexer_get_ch_accounting_for_linesplices(program,src,src_size,where_in_src,which_line,which_column);
	if(ch==CHAR_FORWARD_SLASH)
	{
		/* peek with copies; commit only if a comment really starts */
		size_t wh_in_src=*where_in_src;
		size_t wh_line=*which_line;
		size_t wh_col=*which_column;
		ch=lexer_get_ch_accounting_for_linesplices(program,src,src_size,&wh_in_src,&wh_line,&wh_col);
		if(ch==CHAR_FORWARD_SLASH)
		{
			/* '//' comment: skip to end of line */
			while(
				lexer_get_ch_accounting_for_linesplices(
					program,src,src_size,&wh_in_src,&wh_line,&wh_col)!=CHAR_FORM_FEED_TAB
				&&
				wh_in_src<src_size
			     );
			*where_in_src=wh_in_src;
			*which_line=wh_line;
			*which_column=wh_col;
			return CHAR_FORM_FEED_TAB;
		}else if(ch==CHAR_STAR)
		{
			/* block comment: scan for the closing star-slash */
			enum {
				START,
				STAR,
				END
			} state = START;
			while(state!=END && wh_in_src<src_size)
			{
				ch=lexer_get_ch_accounting_for_linesplices(program,src,src_size,&wh_in_src,&wh_line,&wh_col);
				if(ch==CHAR_STAR)
					state=STAR; /* a run of '*' stays armed */
				else if(state==STAR && ch==CHAR_FORWARD_SLASH)
					state=END;
				else
					state=START;
			}
			if(state!=END)
				push_generic_error(program,"Unterminated comment");
			*where_in_src=wh_in_src;
			*which_line=wh_line;
			*which_column=wh_col;
			return CHAR_SPACE;
		}else
		{
			/* plain '/': the peeked character stays unconsumed */
			return CHAR_FORWARD_SLASH;
		}
	}else
	{
		return ch;
	}
}
/* Releases the lexer state allocated by get_lexer_data. Does not touch the
 * source file, the program, or any buffered token the lexer still holds. */
void delete_lexer_data(struct Lexer_Data *lexer_data)
{
wonky_free(lexer_data);
}
#endif