Hi,
hckr83 wrote:Making the tokenizer is what I can't do...I have tried like 10 times to make something convert a plain text file in a language convert to tokens...it just doesn't work for me...
A quick crash course....
First you need some output routines which would send the tokenized data somewhere. For now just send the tokens to the screen and worry about the details later:
Code: Select all
/* Debug output sink for tokens that carry no payload: print the token id. */
void sendSimpleToken(unsigned short token) {
    printf("%d\n", token);
}
/* Debug output sink for tokens with a string payload (identifiers, strings):
 * print the token id followed by the quoted text. */
void sendStringToken(unsigned short token, char *string) {
    printf("%d '%s'\n", token, string);
}
/* Debug output sink for tokens with a numeric payload.
 * The payload is a long int, so it must be printed with %ld, not %d —
 * a mismatched printf specifier is undefined behavior and prints garbage
 * on platforms where long is wider than int. */
void sendNumberToken(unsigned short token, long int number) {
    printf("%d %ld\n", token, number);
}
Next you want some framework for your tokenizer. I'm going to assume it's a line-oriented language (like assembly) and your tokenizer is sent lines of ASCII by the code that reads the source file/s from disk:
Code: Select all
/* Tokenizer skeleton: nothing is recognized yet, so any non-empty input is
 * a syntax error. Report it once and bail out — the original printed the
 * error in a loop forever because it never consumed input or exited. */
void tokenize(char *input) {
    while (*input != 0) {
        printf("Syntax error!\n");
        exit(EXIT_FAILURE);
    }
}
Now you probably want to ignore any white space between things. That's easy enough:
Code: Select all
/* Tokenizer, step 2: skip white space, everything else is (for now) an error. */
void tokenize(char *input) {
    while (*input != 0) {
        /* == not = : the original assigned ' ' into the buffer, and an
         * assignment of a nonzero character is always true, so every byte
         * was treated as white space. */
        if ((*input == ' ') || (*input == '\t') || (*input == '\r') || (*input == '\n')) {
            input++;
        } else {
            /* Only report an error when the character was NOT handled,
             * and stop so we don't spin forever on the same character. */
            printf("Syntax error!\n");
            exit(EXIT_FAILURE);
        }
    }
}
Every language has numerical constants, so let's add them next:
Code: Select all
/* Token identifiers. An enum declaration needs a trailing semicolon. */
enum {
    TOKEN_NUMBER,
};
/* Tokenizer, step 3: white space plus numeric constants. */
void tokenize(char *input) {
    long int number;
    char *temp;

    while (*input != 0) {
        /* == not = (the original assigned into the buffer). */
        if ((*input == ' ') || (*input == '\t') || (*input == '\r') || (*input == '\n')) {
            input++;
        } else {
            number = strtol(input, &temp, 0);
            /* strtol reports success by moving the end pointer; compare the
             * two pointers — the original compared a char (*temp) to a pointer. */
            if (temp != input) {
                sendNumberToken(TOKEN_NUMBER, number);
                input = temp;   /* consume the digits we just parsed */
            } else {
                /* Error only when nothing matched, then stop. */
                printf("Syntax error!\n");
                exit(EXIT_FAILURE);
            }
        }
    }
}
Then there's identifiers (labels, keywords, etc):
Code: Select all
/* Token identifiers. An enum declaration needs a trailing semicolon. */
enum {
    TOKEN_NUMBER,
    TOKEN_IDENTIFIER,
};
/* Tokenizer, step 4: white space, numbers, and identifiers. */
void tokenize(char *input) {
    long int number;
    char *temp;
    char identifier[256];
    int i;

    while (*input != 0) {
        /* == not = (the original assigned into the buffer). */
        if ((*input == ' ') || (*input == '\t') || (*input == '\r') || (*input == '\n')) {
            input++;
        } else {
            number = strtol(input, &temp, 0);
            if (temp != input) {
                sendNumberToken(TOKEN_NUMBER, number);
                input = temp;
            } else {
                /* >= and <= so 'A' and 'Z' themselves are accepted (the
                 * original's > and < excluded both endpoints); cap at 255
                 * so the NUL terminator can't overflow identifier[256]. */
                i = 0;
                while ((input[i] >= 'A') && (input[i] <= 'Z') && (i < 255)) {
                    identifier[i] = input[i];
                    i++;
                }
                if (i != 0) {
                    identifier[i] = 0;
                    sendStringToken(TOKEN_IDENTIFIER, identifier);
                    input += i;
                } else {
                    /* Error only when nothing matched, then stop —
                     * the original printed the error on every iteration
                     * and looped forever on bad input. */
                    printf("Syntax error!\n");
                    exit(EXIT_FAILURE);
                }
            }
        }
    }
}
You'd probably also want to handle operators, punctuation, etc. I'll only do a few:
Code: Select all
/* Token identifiers. An enum declaration needs a trailing semicolon. */
enum {
    TOKEN_NUMBER,
    TOKEN_IDENTIFIER,
    TOKEN_LEFTBRACKET,
    TOKEN_RIGHTBRACKET,
    TOKEN_MULTIPLY,
    TOKEN_ADD,
};
/* Tokenizer, step 5: white space, operators, brackets, numbers, identifiers. */
void tokenize(char *input) {
    long int number;
    char *temp;
    char identifier[256];
    int i;

    while (*input != 0) {
        /* == not = (the original assigned into the buffer). */
        if ((*input == ' ') || (*input == '\t') || (*input == '\r') || (*input == '\n')) {
            input++;
        } else if (*input == '*') {
            sendSimpleToken(TOKEN_MULTIPLY);
            input++;    /* consume the operator — the original never advanced
                         * past single-character tokens and looped forever */
        } else if (*input == '+') {
            sendSimpleToken(TOKEN_ADD);
            input++;
        } else if (*input == '(') {
            sendSimpleToken(TOKEN_LEFTBRACKET);
            input++;
        } else if (*input == ')') {
            sendSimpleToken(TOKEN_RIGHTBRACKET);
            input++;
        } else {
            number = strtol(input, &temp, 0);
            if (temp != input) {
                sendNumberToken(TOKEN_NUMBER, number);
                input = temp;
            } else {
                /* >= / <= so 'A' and 'Z' are accepted; bound i to avoid
                 * overflowing identifier[256]. */
                i = 0;
                while ((input[i] >= 'A') && (input[i] <= 'Z') && (i < 255)) {
                    identifier[i] = input[i];
                    i++;
                }
                if (i != 0) {
                    identifier[i] = 0;
                    sendStringToken(TOKEN_IDENTIFIER, identifier);
                    input += i;
                } else {
                    /* Error only when nothing matched, then stop. */
                    printf("Syntax error!\n");
                    exit(EXIT_FAILURE);
                }
            }
        }
    }
}
Handling string constants would be a good idea. Also if your language has comments like assembly you could probably stop tokenizing when you get a comment character:
Code: Select all
/* Token identifiers. An enum declaration needs a trailing semicolon. */
enum {
    TOKEN_NUMBER,
    TOKEN_IDENTIFIER,
    TOKEN_STRING,
    TOKEN_LEFTBRACKET,
    TOKEN_RIGHTBRACKET,
    TOKEN_MULTIPLY,
    TOKEN_ADD,
};
void tokenize(char *input) {
long int number;
char *temp;
char identifier[256];
int i;
while(*input != 0) {
if( (*input = ' ') || (*input = '\t') || (*input = '\r') || (*input = '\n') ) {
input++;
} else if(*input == ';') {
return;
} else if(*input == '*') {
sendSimpleToken(TOKEN_MULTIPLY);
} else if(*input == '+') {
sendSimpleToken(TOKEN_ADD);
} else if(*input == '(') {
sendSimpleToken(TOKEN_LEFTBRACKET);
} else if(*input == ')') {
sendSimpleToken(TOKEN_RIGHTBRACKET);
} else if(*input == '"') {
i = 1;
while( (input[i] != '"') && (input[i] != 0) {
i++;
}
if(input[i] == 0) {
printf("Hey - there's no right quote character on that string!\n");
exit(EXIT_FAILURE);
}
input[i] = 0;
sendStringToken(TOKEN_STRING, input);
input += i + 1;
} else {
number = strtol(input, &temp, 0);
if(temp != input) {
sendNumberToken(TOKEN_NUMBER, number);
input = temp;
} else {
i = 0;
while( (input[i] > 'A') && (input[i] < 'Z') ) {
identifier[i] = input[i];
i++;
}
if(i != 0) {
identifier[i] = 0;
sendStringToken(TOKEN_IDENTIFIER, identifier);
input += i;
}
}
}
printf("Syntax error!\n");
exit(EXIT_FAILURE);
}
}
Lastly, there's keywords. At the moment a keyword would end up being tokenized as an identifier. If your compiler handles several languages (e.g. C and inline assembly, or NASM and AT&T) it's probably better to leave them for the parser to sort out. Otherwise, you could start adding them:
Code: Select all
/* Token identifiers. An enum declaration needs a trailing semicolon. */
enum {
    TOKEN_NUMBER,
    TOKEN_IDENTIFIER,
    TOKEN_STRING,
    TOKEN_LEFTBRACKET,
    TOKEN_RIGHTBRACKET,
    TOKEN_MULTIPLY,
    TOKEN_ADD,
    TOKEN_FOO_KEYWORD,
    TOKEN_BAR_KEYWORD,
};
void tokenize(char *input) {
long int number;
char *temp;
char identifier[256];
int i;
while(*input != 0) {
if( (*input = ' ') || (*input = '\t') || (*input = '\r') || (*input = '\n') ) {
input++;
} else if(*input == ';') {
return;
} else if(*input == '*') {
sendSimpleToken(TOKEN_MULTIPLY);
} else if(*input == '+') {
sendSimpleToken(TOKEN_ADD);
} else if(*input == '(') {
sendSimpleToken(TOKEN_LEFTBRACKET);
} else if(*input == ')') {
sendSimpleToken(TOKEN_RIGHTBRACKET);
} else if(*input == '"') {
i = 1;
while( (input[i] != '"') && (input[i] != 0) {
i++;
}
if(input[i] == 0) {
printf("Hey - there's no right quote character on that string!\n");
exit(EXIT_FAILURE);
}
input[i] = 0;
sendStringToken(TOKEN_STRING, input);
input += i + 1;
} else {
number = strtol(input, &temp, 0);
if(temp != input) {
sendNumberToken(TOKEN_NUMBER, number);
input = temp;
} else {
i = 0;
while( (input[i] > 'A') && (input[i] < 'Z') ) {
identifier[i] = input[i];
i++;
}
if(i != 0) {
identifier[i] = 0;
if(strcasecmp(identifier, "FOO") == 0) {
sendSimpleToken(TOKEN_FOO_KEYWORD);
} else if(strcasecmp(identifier, "BAR") == 0) {
sendSimpleToken(TOKEN_BAR_KEYWORD);
} else {
sendStringToken(TOKEN_IDENTIFIER, identifier);
}
input += i;
}
}
}
printf("Syntax error!\n");
exit(EXIT_FAILURE);
}
}
Most of the code above is fairly dodgy (untested, and intended as an example only), but it should be able to tokenize something like this:
Code: Select all
; A comment
FOO 1 * (3 + 4)
BAR "Hello World"
Of course it'd also accept some gibberish like this: FOO ) BAR 123 + + ( "oops"
That's OK - the next stage of your compiler can work out what is gibberish and what isn't.
There's also a lot missing from the example code. For example, you'd want to be able to handle larger integers and floating point constants (I store all numerical constants as arbitrary precision floating point). It's also a good idea to use an array of keywords and write a function that checks each entry in the array to see if a string is a keyword or not, instead of using a "strcasecmp" for each possible keyword (it's easier to maintain that way, and means you can use hash tables or something to improve performance later on). You'd also want to add a generic error handling function, so that the compiler will tell you which line in which file it didn't like.
After writing your tokenizer you'd want to replace those output routines. For this I tend to add the output to a buffer, and send the buffer to another thread when it gets full or when there's nothing left to tokenize (but that's just how I do things - most people would just collect it all in memory).
I'd also be tempted to write the reverse - some code to convert the tokens back into ASCII. This just makes it easier to debug your compiler later.
Cheers,
Brendan