WONKY



LOG | FILES | OVERVIEW


#ifndef WONKY_LEXER_C
#define WONKY_LEXER_C WONKY_LEXER_C
/* Implementation of the wonky lexer: turns a Source_File into a Preprocessing_Translation_Unit. */#include <lexer.h>


/*
 * Lexes src into a preprocessing translation unit and registers it in
 * program->preprocessing_translation_units, keyed by the source file name.
 * Returns the unit, or NULL (with an "Empty translation unit" error pushed
 * onto program) when the file produced no tokens.
 */
struct Preprocessing_Translation_Unit* lex(struct Source_File *src,struct Program *program)
{
	struct Lexer_Data *lexer_data;
	struct Preprocessing_Translation_Unit *lexed_unit;

	lexer_data=get_lexer_data(src,program);
	lexed_unit=lex_inner(lexer_data);
	delete_lexer_data(lexer_data);

	/* BUG FIX: check for an empty unit *before* registering it. The original
	   pushed the unit into the map and then returned NULL, so callers that
	   consult the map later would find a unit the lexer itself rejected. */
	if(lexed_unit->tokens->size==0)
	{
		push_generic_error(program,"Empty translation unit");
		return NULL;
	}

	Map_Push(
			program->preprocessing_translation_units,
			src->src_name->name,
			src->src_name->name_size,
			lexed_unit
		);

	return lexed_unit;
}


/*
 * Drains the lexer until end of input, pushing every recognised token into a
 * freshly created preprocessing translation unit for lexer_data's source.
 * NULL tokens (whitespace-only tails) are simply skipped.
 */
struct Preprocessing_Translation_Unit* lex_inner(struct Lexer_Data *lexer_data)
{
	struct Preprocessing_Translation_Unit *ret;
	struct token *next_token;

	ret=get_preprocessing_translation_unit(lexer_data->src);

	for(;!lexer_eof(lexer_data);)
	{
		next_token=lexer_extract_next_token(lexer_data);
		if(next_token==NULL)
			continue;
		push_token_into_preprocessing_translation_unit(ret,next_token);
	}

	return ret;
}


/*
 * Allocates and initialises the lexer's bookkeeping structure for one pass
 * over src. The caller owns the result and releases it with
 * delete_lexer_data(); src and program are borrowed, not owned.
 */
struct Lexer_Data* get_lexer_data(struct Source_File *src,struct Program *program)
{
	struct Lexer_Data *ret;

	ret=wonky_malloc(sizeof(struct Lexer_Data));

	/* Borrowed references to the input and the diagnostic sink. */
	ret->src=src;
	ret->program=program;

	/* Cursor starts at the very beginning of the file. */
	ret->where_in_src=0;
	ret->which_row=0;
	ret->which_column=0;

	/* Lexing state: fresh line, not inside an #if body, normal automata. */
	ret->is_in_the_begining_of_line=1;
	ret->is_in_if_directive_body=0;
	ret->automata_view=AUTOMATA_VIEW_NORMAL;

	/* Token chaining and one-token lookahead. */
	ret->previous_token_location=&start_of_file;
	ret->buffer_token=NULL;

	return ret;
}

/*
 * Intended to lex a single token out of a bare string (no Source_File).
 * BUG FIX: the original body set up the automata walk and then fell off the
 * end of a non-void function — undefined behavior as soon as a caller uses
 * the result. Until the string-fed automata walk is implemented, return NULL
 * explicitly so callers get a well-defined "no token" answer.
 */
struct token* lex_token_from_string(char *str,size_t string_length)
{
	struct Automata_Node *head;
	struct Automata_Node *follower;

	wonky_assert(str && string_length);

	head=&chonky[0];
	follower=NULL;

	/* TODO(review): feed str through the chonky automaton (cf.
	   lexer_feed_automata_until_error) and build the token; unimplemented. */
	(void)head;
	(void)follower;

	return NULL;
}

/*
 * Advances the lexer past a run of whitespace (space, vertical tab,
 * horizontal tab and CHAR_FORM_FEED_TAB), honouring line splices and
 * comments via lexer_get_ch_accounting_for_linesplices_and_comments.
 *
 * The cursors are read into locals and committed back to lexer_data one
 * iteration *behind* the read, so the first non-whitespace character is
 * left unconsumed for the token automata.
 */
void lexer_skip_white_space(struct Lexer_Data *lexer_data)
{
	size_t where_in_src=lexer_data->where_in_src;
	size_t which_line=lexer_data->which_row;
	size_t which_column=lexer_data->which_column;
	enum Source_Chars ch;

	/* ch is seeded with CHAR_SPACE so the loop condition passes and the body
	   runs once before any character is read; that first iteration commits
	   the still-unchanged cursors, which is harmless. */
	for(ch=CHAR_SPACE;
	    (ch==CHAR_SPACE || ch==CHAR_VERTICAL_TAB || ch==CHAR_HORISONTAL_TAB || ch==CHAR_FORM_FEED_TAB) && !lexer_eof(lexer_data);
	    ch=lexer_get_ch_accounting_for_linesplices_and_comments(
		    		lexer_data->program,
				lexer_data->src->src,
				lexer_data->src->src_size,
				&where_in_src,
				&which_line,
				&which_column)
	   )
	{

		/* NOTE(review): CHAR_FORM_FEED_TAB is treated as the newline marker
		   here and in lexer_get_ch_accounting_for_linesplices (which bumps
		   the row count on it) — confirm the enum's intended meaning. */
		if(ch==CHAR_FORM_FEED_TAB)
			lexer_data->is_in_the_begining_of_line=1;
		lexer_data->where_in_src=where_in_src;
		lexer_data->which_row=which_line;
		lexer_data->which_column=which_column;
	}


}
/* True once the cursor has consumed every byte of the source buffer. */
inline _Bool lexer_eof(struct Lexer_Data *lexer_data)
{
#warning might want a lexer_skip_white_space(lexer_data) here =)
	return !(lexer_data->where_in_src<lexer_data->src->src_size);
}
/*
 * Returns the next token from the source, or NULL at end of input.
 * A token pre-read by lexer_check() is handed back first. On an
 * unrecognisable lexical element an error token is produced instead.
 */
struct token* lexer_extract_next_token(struct Lexer_Data *lexer_data)
{
	struct token *ret;
	struct Automata_Node *hold_node;
	size_t token_start;

	/* One-token lookahead: lexer_check() may have buffered a token. */
	if(lexer_data->buffer_token!=NULL)
	{
		ret=lexer_data->buffer_token;
		lexer_data->buffer_token=NULL;
		return ret;
	}

	lexer_skip_white_space(lexer_data);
	token_start=lexer_data->where_in_src;

	if(lexer_eof(lexer_data))
		return NULL;

	hold_node=lexer_feed_automata_until_error(lexer_data);

	if(hold_node==NULL)
	{
		/* BUG FIX: the row/column arguments were passed in (column,row)
		   order here, while the call in lexer_make_token_finishing_on_node
		   passes (row,column); made this call agree with that one. */
		return get_error_token("Unrecognised lexical element",
				get_source_location(
					lexer_data->which_row,
					lexer_data->which_column,
					token_start,
					lexer_data->where_in_src-token_start,
					lexer_data->src
					),
				lexer_data->previous_token_location,
				lexer_data->program);
	}

	ret=lexer_make_token_finishing_on_node(lexer_data,hold_node,token_start);
	lexer_data->is_in_the_begining_of_line=0;
	lexer_data->previous_token_location=ret->delta->location;
	return ret;
}

/*
 * Walks the chonky automaton one character at a time until no transition
 * exists, returning the deepest node reached.
 *
 * NOTE(review): because the trailer is set before the first transition is
 * attempted, a failure on the very first character still returns &chonky[0]
 * (the start node) rather than NULL — confirm whether callers rely on that.
 */
struct Automata_Node* lexer_feed_automata_until_error(struct Lexer_Data *lexer_data)
{
	struct Automata_Node *current;
	struct Automata_Node *last_reached;

	last_reached=NULL;
	for(current=&chonky[0];
	    current!=NULL;
	    current=lexer_feed_automata_next_char(lexer_data,current))
	{
		last_reached=current;
	}

	return last_reached;
}

/*
 * Attempts one automaton transition using the next source character.
 * On success the cursor stays advanced and the successor node is returned;
 * on failure (no transition for ch) the cursor is rolled back to where it
 * was before the read and NULL is returned, leaving the character for the
 * next token.
 */
struct Automata_Node *lexer_feed_automata_next_char(struct Lexer_Data *lexer_data,struct Automata_Node *node)
{
	size_t hold_where_in_src;
	size_t hold_which_column;
	size_t hold_which_row;
	struct Automata_Node *ret;
	enum Source_Chars ch;

	/* Snapshot the cursor so a failed transition can be undone. */
	hold_where_in_src=lexer_data->where_in_src;
	hold_which_column=lexer_data->which_column;
	hold_which_row=lexer_data->which_row;


	ch=lexer_get_ch_accounting_for_linesplices_and_comments(lexer_data->program,lexer_data->src->src,
										    lexer_data->src->src_size,
										    &lexer_data->where_in_src,
										    &lexer_data->which_row,
										    &lexer_data->which_column);

	/* The shared id_node sentinel is expanded into a per-path node so
	   identifier spellings get distinct automata states. */
	if(node->delta[ch]==id_node)
	{
		ret=get_new_id_node(node,ch);
	}else
	{
		ret=node->delta[ch];
	}
	if(ret==NULL)
	{
		/* No transition: restore the cursor and report the dead end. */
		lexer_data->where_in_src=hold_where_in_src;
		lexer_data->which_column=hold_which_column;
		lexer_data->which_row=hold_which_row;

		return NULL;
	}else
	{
		return ret;
	}
}

/*
 * Builds the concrete token for the automata node the walk stopped on.
 * start_position is the offset of the token's first byte; the span
 * [start_position, lexer_data->where_in_src) is the token's spelling.
 * Dispatches on finishing_node->keyword to the matching token constructor;
 * '#'-related keywords behave differently inside a preprocessing directive
 * (lexer_data->automata_view).
 */
struct token *lexer_make_token_finishing_on_node(struct Lexer_Data *lexer_data,struct Automata_Node *finishing_node,size_t start_position)
{
	struct Source_Location *token_location;

	wonky_assert(lexer_data->where_in_src > start_position);
	wonky_assert(is_valid_automata_node(finishing_node));

	token_location=get_source_location(
					lexer_data->which_row,
					lexer_data->which_column,
					start_position,
					lexer_data->where_in_src-start_position,
					lexer_data->src
					);


	switch(finishing_node->keyword)
	{
		/* '##' is only meaningful as the paste operator inside a directive. */
		case KW_HASHTAG_HASHTAG:
			if(lexer_data->automata_view==AUTOMATA_VIEW_PREPROCESSING_DIRECTIVE)
			{
				return get_small_token(KW_HASHTAG_HASHTAG,token_location,lexer_data->previous_token_location);
			}else
			{
				return get_error_token("Ran into ## while not in a preprocessing directive. Invalid syntax.",token_location,lexer_data->previous_token_location,lexer_data->program);
			}
			break;
		/* '#' inside a directive is the stringise operator; at the start of a
		   logical line it introduces a new directive; anywhere else it is an
		   error. */
		case KW_HASHTAG:
			if(lexer_data->automata_view==AUTOMATA_VIEW_PREPROCESSING_DIRECTIVE)
			{
				return get_small_token(KW_HASHTAG,token_location,lexer_data->previous_token_location);
			}else
			{
				if(!lexer_data->is_in_the_begining_of_line)
				{
					return get_error_token("# is not in the begining of a logical line",token_location,lexer_data->previous_token_location,lexer_data->program);
				}else
				{
					return lex_preprocessing_directive(lexer_data,token_location);
				}
			}
			break;
		/* Plain identifier: lazily intern its spelling on the automata node
		   so repeated occurrences share one struct identifier. */
		case KW_ID:
			if(finishing_node->data==NULL)
			{
				struct identifier *id;

				id=get_identifier(lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);
				finishing_node->data=id;
			}
			return get_id_token(finishing_node->data,token_location,lexer_data->previous_token_location);
			break;
		/* C keywords: also carry an interned identifier (a keyword can still
		   act as a macro name during preprocessing). */
		case KW_AUTO:
		case KW_DO:
		case KW_DOUBLE:
		case KW_INT:
		case KW_STRUCT:
		case KW_BREAK:
		case KW_ELSE:
		case KW_LONG:
		case KW_SWITCH:
		case KW_CASE:
		case KW_ENUM:
		case KW_REGISTER:
		case KW_TYPEDEF:
		case KW_CHAR:
		case KW_EXTERN:
		case KW_RETURN:
		case KW_UNION:
		case KW_CONST:
		case KW_FLOAT:
		case KW_SHORT:
		case KW_UNSIGNED:
		case KW_CONTINUE:
		case KW_FOR:
		case KW_SIGNED:
		case KW_VOID:
		case KW_DEFAULT:
		case KW_GOTO:
		case KW_SIZEOF:
		case KW_VOLATILE:
		case KW_IF:
		case KW_STATIC:
		case KW_WHILE:
		case KW_INLINE:
		case KW_RESTRICT:
		case KW_BOOL:
		case KW_COMPLEX:
		case KW_IMAGINARY:
			if(finishing_node->data==NULL)
			{
				struct identifier *id;

				id=get_identifier(lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);
				finishing_node->data=id;
			}
			return get_keyword_token(finishing_node->keyword,token_location,lexer_data->previous_token_location,finishing_node->data);

		/* Punctuators and operators. */
		case KW_EXCLAMATION:
		case KW_PERCENT:
		case KW_AND:
		case KW_AND_AND:
		case KW_OPEN_NORMAL:
		case KW_CLOSE_NORMAL:
		case KW_STAR:
		case KW_PLUS:
		case KW_COMMA:
		case KW_MINUS:
		case KW_DOT:
		case KW_ARROW:
		case KW_COLUMN:
		case KW_SEMICOLON:
		case KW_LESS:
		case KW_EQ:
		case KW_EQEQ:
		case KW_MORE:
		case KW_QUESTION:
		case KW_HAT:
		case KW_PIPE:
		case KW_PIPE_PIPE:
		case KW_TILDE:
		case KW_PLUSPLUS:
		case KW_MINUSMINUS:
		case KW_SHIFT_RIGHT:
		case KW_SHIFT_LEFT:
		case KW_LESS_EQ:
		case KW_MORE_EQ:
		case KW_NOT_EQ:
		case KW_PLUS_EQ:
		case KW_MINUS_EQ:
		case KW_STAR_EQ:
		case KW_PERCENT_EQ:
		case KW_SHIFT_LEFT_EQ:
		case KW_SHIFT_RIGHT_EQ:
		case KW_AND_EQ:
		case KW_HAT_EQ:
		case KW_PIPE_EQ:
		case KW_ELIPSIS:
		case KW_DIV:
		case KW_OPEN_SQUARE:
		case KW_CLOSE_SQUARE:
		case KW_CLOSE_CURLY:
		case KW_OPEN_CURLY:
		case KW_DIV_EQ:
		case KW_FORWARD_SLASH:
			return get_punctuator_token(finishing_node->keyword,token_location,lexer_data->previous_token_location);

		/* Numeric and character constants keep their raw spelling; the
		   constructor re-reads it from the source span. */
		case KW_HEXADECIMAL_CONSTANT:
		case KW_DECIMAL_CONSTANT:
		case KW_OCTAL_CONSTANT:
		case KW_UNSIGNED_DECIMAL_CONSTANT:
		case KW_UNSIGNED_OCTAL_CONSTANT:
		case KW_UNSIGNED_HEXADECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_HEXADECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_OCTAL_CONSTANT:
		case KW_UNSIGNED_LONG_DECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_LONG_DECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_LONG_HEXADECIMAL_CONSTANT:
		case KW_UNSIGNED_LONG_LONG_OCTAL_CONSTANT:
		case KW_LONG_HEXADECIMAL_CONSTANT:
		case KW_LONG_OCTAL_CONSTANT:
		case KW_LONG_DECIMAL_CONSTANT:
		case KW_LONG_LONG_HEXADECIMAL_CONSTANT:
		case KW_LONG_LONG_OCTAL_CONSTANT:
		case KW_LONG_LONG_DECIMAL_CONSTANT:
		case KW_DOUBLE_DECIMAL_CONSTANT:
		case KW_LONG_DOUBLE_DECIMAL_CONSTANT:
		case KW_FLOAT_DECIMAL_CONSTANT:
		case KW_DOUBLE_HEXADECIMAL_CONSTANT:
		case KW_LONG_DOUBLE_HEXADECIMAL_CONSTANT:
		case KW_FLOAT_HEXADECIMAL_CONSTANT:
		case KW_CHAR_CONSTANT:
		case KW_WIDE_CHAR_CONSTANT:
			return get_constant_token(finishing_node->keyword,token_location,lexer_data->previous_token_location,lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);

		/* String literals: the +1/-2 trims the surrounding quote characters
		   from the stored spelling. */
		case KW_STRING:
		case KW_WIDE_STRING:
			return get_string_token(finishing_node->keyword,token_location,lexer_data->previous_token_location,lexer_data->src->src+start_position+1,lexer_data->where_in_src-start_position-2);
		/* Predefined macros expand to dedicated token kinds. */
		case PKW_FILE_MACRO:
			return get_file_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_DATE_MACRO:
			return get_date_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_LINE_MACRO:
			return get_line_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_STDC_MACRO:
			return get_stdc_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_STDC_HOSTED_MACRO:
			return get_stdc_hosted_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_STDC_VERSION_MACRO:
			return get_stdc_version_macro_token(token_location,lexer_data->previous_token_location);
		case PKW_TIME_MACRO:
			return get_time_macro_token(token_location,lexer_data->previous_token_location);
		/* Directive names are only keywords after '#'; in ordinary code they
		   are plain identifiers, so intern and return an id token. */
		case PKW_IF:
		case PKW_IFDEF:
		case PKW_IFNDEF:
		case PKW_ELIF:
		case PKW_ELSE:
		case PKW_ENDIF:
		case PKW_INCLUDE:
		case PKW_DEFINE:
		case PKW_UNDEF:
		case PKW_LINE:
		case PKW_ERROR:
		case PKW_PRAGMA:
			if(finishing_node->data==NULL)
			{
				struct identifier *id;

				id=get_identifier(lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);
				finishing_node->data=id;
			}
			return get_id_token(finishing_node->data,token_location,lexer_data->previous_token_location);
			break;
		/* 'defined' is an operator only inside a preprocessing directive;
		   elsewhere it is an ordinary identifier. */
		case PKW_DEFINED:
			if(lexer_data->automata_view==AUTOMATA_VIEW_PREPROCESSING_DIRECTIVE)
			{
				return lex_defined_unary_operator(lexer_data,token_location);
			}else
			{
				if(finishing_node->data==NULL)
				{
					struct identifier *id;

					id=get_identifier(lexer_data->src->src+start_position,lexer_data->where_in_src-start_position);
					finishing_node->data=id;
				}
				return get_id_token(finishing_node->data,token_location,lexer_data->previous_token_location);
			}
		default:
			return get_error_token("Unexpected token",token_location,lexer_data->previous_token_location,lexer_data->program);
	}

	wonky_assert(SHOULD_NOT_REACH_HERE);
}
/* Lexes the preprocessing 'defined' unary operator by delegating to the
   preprocessor's own lexer. */
struct token *lex_defined_unary_operator(struct Lexer_Data *lexer_data,struct Source_Location *where)
{
	struct token *ret;

	ret=preprocessing_lex_defined_unary_operator(lexer_data,where);
	return ret;
}
/* Lexes the directive introduced by a '#' at where. A '#' that is the last
   thing in the file yields an EOF token instead. */
struct token *lex_preprocessing_directive(struct Lexer_Data *lexer_data,struct Source_Location *where)
{
	return lexer_eof(lexer_data)
		? get_eof_token()
		: preprocessing_lex_directive(lexer_data,where);
}
_Bool lexer_get_and_check(struct Lexer_Data *lexer_data,enum LEXER_TYPE token_type)
{
	if(lexer_check(lexer_data,token_type))
	{
		delete_token(lexer_data->buffer_token);
		return 1;
	}else
	{
		return 0;
	}
}
_Bool lexer_check(struct Lexer_Data *lexer_data,enum LEXER_TYPE token_type)
{
	if(lexer_data->buffer_token==NULL)
		lexer_data->buffer_token=lexer_extract_next_token(lexer_data);

	if(lexer_data->buffer_token && lexer_data->buffer_token->type==token_type)
	{
		return 1;
	}else
	{
		return 0;
	}
}
/*
 * Reads one logical character from src at *where_in_src, transparently
 * consuming line splices (backslash immediately followed by the newline
 * marker CHAR_FORM_FEED_TAB). Advances *where_in_src past everything it
 * consumed and keeps *which_line / *which_column in step. Returns CHAR_SPACE
 * at end of input; a backslash that is the last byte of the file pushes a
 * "Can't linesplice into an end of file" error and also yields CHAR_SPACE.
 *
 * NOTE(review): a lone backslash NOT followed by a newline is consumed as a
 * splice-start and then the following character is returned — the backslash
 * itself is never delivered to the caller. Confirm that is intended.
 */
enum Source_Chars lexer_get_ch_accounting_for_linesplices(struct Program *program,const char *src,size_t src_size,size_t *where_in_src,size_t *which_line,size_t *which_column)
{
	enum {
		UNKNOWN_CHAR,
		START_OF_POSSIBLE_LINE_SPLICE,
		KNOWN_CHAR
	} state = UNKNOWN_CHAR;
	enum Source_Chars ch;

	/* Skip over any number of backslash-newline pairs until a real
	   character is identified (KNOWN_CHAR) or the input runs out. */
	while(state!=KNOWN_CHAR && *where_in_src<src_size)
	{
		ch=get_ch(src+*where_in_src,src_size-*where_in_src);
		switch(state)
		{
			case UNKNOWN_CHAR:
				if(ch==CHAR_BACKWARD_SLASH)
				{
					state=START_OF_POSSIBLE_LINE_SPLICE;
					++*where_in_src;
					++*which_column;
				}else
				{
					state=KNOWN_CHAR;	
				}
				break;
			case START_OF_POSSIBLE_LINE_SPLICE:
				/* Backslash-newline: a splice — swallow it and start over. */
				if(ch==CHAR_FORM_FEED_TAB)
				{
					state=UNKNOWN_CHAR;
					++*where_in_src;
					*which_column=0;
					++*which_line;
				}else
				{
					state=KNOWN_CHAR;	
				}
				break;
			default:
				wonky_assert(SHOULD_NOT_REACH_HERE);
		}
	}

	if(*where_in_src<src_size && state==KNOWN_CHAR)
	{
		/* Consume the identified character and update the line/column. */
		++*where_in_src;
		if(ch==CHAR_FORM_FEED_TAB)
		{
			++*which_line;
			*which_column=0;
		}else
		{
			++*which_column;
		}
		return ch;
	}else if(state==START_OF_POSSIBLE_LINE_SPLICE)
	{
		/* The file ended right after a backslash. */
		push_generic_error(program,"Can't linesplice into an end of file");
		return CHAR_SPACE;
	}else
	{
		return CHAR_SPACE;
	}

}
/*
 * Reads one logical character, additionally collapsing comments:
 * a '//' comment is consumed to end of line and reported as the newline
 * marker CHAR_FORM_FEED_TAB; a '/ * ... * /' block comment is consumed and
 * reported as a single CHAR_SPACE (per the C translation phases). A '/' not
 * followed by '/' or '*' is returned as-is — the speculative read uses
 * local cursor copies, so nothing past the '/' is consumed in that case.
 *
 * NOTE(review): a block comment left unterminated at end of file is
 * silently reported as CHAR_SPACE with no diagnostic — confirm intended.
 */
enum Source_Chars lexer_get_ch_accounting_for_linesplices_and_comments(struct Program *program,const char *src,size_t src_size,size_t *where_in_src,size_t *which_line,size_t *which_column)
{
	enum Source_Chars ch;

	ch=lexer_get_ch_accounting_for_linesplices(program,src,src_size,where_in_src,which_line,which_column);
	if(ch==CHAR_FORWARD_SLASH)
	{
		/* Peek ahead on cursor copies; commit only if a comment is found. */
		size_t wh_in_src=*where_in_src;
		size_t wh_line=*which_line;
		size_t wh_col=*which_column;

		ch=lexer_get_ch_accounting_for_linesplices(program,src,src_size,&wh_in_src,&wh_line,&wh_col);

		if(ch==CHAR_FORWARD_SLASH)
		{
			/* Line comment: swallow everything up to the newline. */
			while(
			  lexer_get_ch_accounting_for_linesplices(
				  program,src,src_size,&wh_in_src,&wh_line,&wh_col)!=CHAR_FORM_FEED_TAB
			  &&
			  wh_in_src<src_size
			  );

			*where_in_src=wh_in_src;
			*which_line=wh_line;
			*which_column=wh_col;
			return CHAR_FORM_FEED_TAB;

		}else if(ch==CHAR_STAR)
		{
			/* Block comment: scan for the closing star-slash pair. */
			enum {
				START,
				STAR,
				END
			} state = START;

			while(state!=END && wh_in_src<src_size)
			{
				ch=lexer_get_ch_accounting_for_linesplices(program,src,src_size,&wh_in_src,&wh_line,&wh_col);
				if(state==START && ch==CHAR_STAR)
					state=STAR;
				else if(state==STAR && ch==CHAR_FORWARD_SLASH)
					state=END;
				else
					state=START;
			}
			*where_in_src=wh_in_src;
			*which_line=wh_line;
			*which_column=wh_col;
			return CHAR_SPACE;
		}else
		{
			/* Plain '/': the peeked character is NOT committed. */
			return CHAR_FORWARD_SLASH;
		}
	}else
	{
		return ch;
	}


}
/* Releases the lexer bookkeeping structure. The Source_File and Program it
   references are owned elsewhere and are deliberately left alone. */
void delete_lexer_data(struct Lexer_Data *lexer_data)
{
	wonky_free(lexer_data);
}
#endif