blob: e72c56eddc99500b3c0540553906290829349614 [file] [log] [blame]
/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the preprocessor.
 *
 * Copyright (C) 2003 Transmeta Corp.
 *               2003 Linus Torvalds
 *
 * Licensed under the Open Software License version 1.1
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include "lib.h"
#include "allocate.h"
#include "token.h"
#include "symbol.h"
/* End-of-input marker returned by nextchar()/nextchar_slow(). */
#define EOF (-1)
/* Number of slots of input_streams handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Growable table of stream descriptors, indexed by stream number. */
struct stream *input_streams;
/* Capacity of the input_streams allocation, in entries. */
static int input_streams_allocated;
/* Number of bytes requested from read() on each buffer refill. */
#define BUFSIZE (8192)
/*
 * Per-input tokenizer state: the read buffer, the current source
 * position and the tail of the token list being built.
 */
typedef struct {
	int fd;				/* descriptor passed to read(); tokenize_buffer() passes -1 */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* copied into struct position.pos by stream_pos() */
	int line;			/* copied into struct position.line; set to 1 by setup_stream() */
	int nr;				/* stream number this state belongs to */
	int newline;			/* flag copied into the position of the next token */
	int whitespace;			/* flag copied into the position of the next token */
	struct token **tokenlist;	/* link slot the next finished token is stored through */
	struct token *token;		/* token currently being built, or NULL */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
const char *stream_name(int stream)
if (stream < 0 || stream > input_stream_nr)
return "<bad stream>";
return input_streams[stream].name;
/*
 * Snapshot the current location of 'stream' as a struct position:
 * stream number, line, column and the pending newline/whitespace flags.
 * 'type' and 'noexpand' are set to 0.
 */
static struct position stream_pos(stream_t *stream)
{
	struct position pos;

	pos.type = 0;
	/* NOTE(review): 'stream' member name comes from struct position in token.h - confirm */
	pos.stream = stream->nr;
	pos.newline = stream->newline;
	pos.whitespace = stream->whitespace;
	pos.pos = stream->pos;
	pos.line = stream->line;
	pos.noexpand = 0;
	return pos;
}
const char *show_special(int val)
static char buffer[4];
buffer[0] = val;
buffer[1] = 0;
if (val >= SPECIAL_BASE)
strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
return buffer;
const char *show_ident(const struct ident *ident)
static char buffer[256];
if (!ident)
return "<noident>";
sprintf(buffer, "%.*s", ident->len, ident->name);
return buffer;
/*
 * Append a source-style rendering of character 'c' at 'ptr' and return
 * the new end of the output.
 *
 *  - printable characters are copied, preceded by a backslash when they
 *    equal 'escape' (the enclosing quote) or are a backslash themselves;
 *  - newline and tab become \n and \t;
 *  - anything else becomes an octal escape.  The short form is used
 *    unless 'next' (the character that will follow) is a digit, in which
 *    case three digits are emitted so the escape cannot absorb it.
 *
 * At most four characters are produced.  Only the octal path (sprintf)
 * writes a NUL at the returned position.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	switch (c) {
	case '\n':
		*ptr++ = 'n';
		return ptr;
	case '\t':
		*ptr++ = 't';
		return ptr;
	}

	if (!isdigit(next))
		return ptr + sprintf(ptr, "%o", c);

	return ptr + sprintf(ptr, "%03o", c);
}
const char *show_string(const struct string *string)
static char buffer[4 * MAX_STRING + 3];
char *ptr;
int i;
if (!string->length)
return "<bad_string>";
ptr = buffer;
*ptr++ = '"';
for (i = 0; i < string->length-1; i++) {
const char *p = string->data + i;
ptr = charstr(ptr, p[0], '"', p[1]);
*ptr++ = '"';
*ptr = '\0';
return buffer;
const char *show_token(const struct token *token)
static char buffer[256];
if (!token)
return "<no token>";
switch (token_type(token)) {
return "syntax error";
return "end-of-input";
return show_ident(token->ident);
return show_string(token->string);
return token->number;
return show_special(token->special);
case TOKEN_CHAR: {
char *ptr = buffer;
int c = token->character;
*ptr++ = '\'';
ptr = charstr(ptr, c, '\'', 0);
*ptr++ = '\'';
*ptr++ = '\0';
return buffer;
sprintf(buffer, "<beginning of '%s'>", stream_name(token->;
return buffer;
sprintf(buffer, "<end of '%s'>", stream_name(token->;
return buffer;
return "WTF???";
int init_stream(const char *name, int fd, const char **next_path)
int stream = input_stream_nr;
struct stream *current;
if (stream >= input_streams_allocated) {
int newalloc = stream * 4 / 3 + 10;
input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
if (!input_streams)
die("Unable to allocate more streams space");
input_streams_allocated = newalloc;
current = input_streams + stream;
memset(current, 0, sizeof(*current));
current->name = name;
current->fd = fd;
current->next_path = next_path;
current->path = NULL;
current->constant = CONSTANT_FILE_MAYBE;
input_stream_nr = stream+1;
return stream;
/*
 * Allocate a token and stamp it with the current position of 'stream'.
 */
static struct token * alloc_token(stream_t *stream)
{
	struct token *fresh = __alloc_token(0);

	fresh->pos = stream_pos(stream);
	return fresh;
}
/*
 * Argh... That was surprisingly messy - handling '\r' complicates
 * things a _lot_.
 */
/*
 * Slow path of nextchar(): refill the buffer when it runs dry and deal
 * with the three special bytes '\r', '\n' and '\\'.
 *
 *  - '\r' is skipped; a '\r' that is not followed by '\n' produces a
 *    "non-ASCII data stream" warning.
 *  - '\n' advances stream->line, resets stream->pos and (outside a
 *    splice) sets stream->newline.
 *  - backslash-newline is spliced away and scanning restarts on the
 *    following line.
 *  - a backslash followed by anything else is returned as '\\'; the
 *    following byte is pushed back so the next call sees it.
 *
 * Returns the next logical character, or EOF when no more input is
 * available.  At EOF a warning is issued for a missing final newline,
 * a trailing '\r' or a trailing backslash-newline.
 */
static int nextchar_slow(stream_t *stream)
{
	int offset = stream->offset;
	int size = stream->size;
	int c;
	int spliced = 0, had_cr, had_backslash, complain;

restart:
	had_cr = had_backslash = complain = 0;

repeat:
	if (offset >= size) {
		/* Streams without a descriptor (fd < 0) have nothing to refill from. */
		if (stream->fd < 0)
			goto got_eof;
		size = read(stream->fd, stream->buffer, BUFSIZE);
		if (size <= 0)
			goto got_eof;
		stream->size = size;
		stream->offset = offset = 0;
	}

	c = stream->buffer[offset++];

	if (had_cr && c != '\n')
		complain = 1;

	if (c == '\r') {
		had_cr = 1;
		goto repeat;
	}

	stream->pos++;

	if (c == '\n') {
		stream->line++;
		stream->pos = 0;
	}

	if (!had_backslash) {
		if (c == '\\') {
			had_backslash = 1;
			goto repeat;
		}
		if (c == '\n')
			stream->newline = 1;
	} else {
		if (c == '\n') {
			if (complain)
				warning(stream_pos(stream), "non-ASCII data stream");
			spliced = 1;
			goto restart;
		}
		/* Lone backslash: hand it out and un-read the byte after it. */
		stream->pos--;
		offset--;
		c = '\\';
	}

out:
	stream->offset = offset;
	if (complain)
		warning(stream_pos(stream), "non-ASCII data stream");

	return c;

got_eof:
	/* A backslash as the very last byte is still returned to the caller. */
	if (had_backslash) {
		c = '\\';
		goto out;
	}
	if (stream->pos)
		warning(stream_pos(stream), "no newline at end of file");
	else if (had_cr)
		warning(stream_pos(stream), "non-ASCII data stream");
	else if (spliced)
		warning(stream_pos(stream), "backslash-newline at end of file");
	return EOF;
}
/*
 * We want this to be as light as possible while covering all normal cases.
 * The slow path (including the logic for line-splicing and the EOF sanity
 * checks) is in nextchar_slow().
 */
/*
 * Return the next character of 'stream'.
 *
 * Fast path: a buffered byte that is not '\r', '\n' or '\\' is consumed
 * directly and the column (stream->pos) is advanced.  Everything else -
 * an empty buffer or one of those three bytes - is left untouched and
 * handed to nextchar_slow().
 */
static inline int nextchar(stream_t *stream)
{
	int offset = stream->offset;

	if (offset < stream->size) {
		int c = stream->buffer[offset++];
		static const char special[256] = {
			['\r'] = 1, ['\n'] = 1, ['\\'] = 1
		};
		if (!special[c]) {
			stream->offset = offset;
			stream->pos++;
			return c;
		}
	}
	return nextchar_slow(stream);
}
/* Shared end-of-input token; mark_eof() uses it when no end token is supplied. */
struct token eof_token_entry;
/*
 * Terminate the token list of 'stream'.
 *
 * A TOKEN_STREAMEND token is appended and linked to 'end_token', or to
 * the shared eof_token_entry when 'end_token' is NULL.  Afterwards
 * stream->tokenlist is cleared, so nothing more can be appended.
 */
static void mark_eof(stream_t *stream, struct token *end_token)
{
	struct token *end = alloc_token(stream);

	token_type(end) = TOKEN_STREAMEND;
	end->pos.newline = 1;

	/* The shared EOF token links to itself and counts as starting a line. */
	eof_token_entry.next = &eof_token_entry;
	eof_token_entry.pos.newline = 1;

	if (!end_token)
		end_token = &eof_token_entry;

	end->next = end_token;
	*stream->tokenlist = end;
	stream->tokenlist = NULL;
}
/*
 * Append the token under construction (stream->token) to the stream's
 * token list and make its ->next slot the new tail.
 */
static void add_token(stream_t *stream)
{
	struct token *done = stream->token;

	stream->token = NULL;
	done->next = NULL;
	*stream->tokenlist = done;
	stream->tokenlist = &done->next;
}
/*
 * Abandon the token under construction, folding the newline and
 * whitespace flags recorded in its position back into the stream so
 * they are not lost.
 */
static void drop_token(stream_t *stream)
{
	struct token *dropped = stream->token;

	stream->newline |= dropped->pos.newline;
	stream->whitespace |= dropped->pos.whitespace;
	stream->token = NULL;
}
/*
 * Character-class flag bits stored in the cclass[] table.
 */
enum {
	Letter		= 1 << 0,	/* may appear in an identifier */
	Digit		= 1 << 1,	/* decimal digit */
	Hex		= 1 << 2,	/* hexadecimal digit */
	Exp		= 1 << 3,	/* exponent marker (e, E, p, P) */
	Dot		= 1 << 4,	/* '.' */
	ValidSecond	= 1 << 5,	/* may be the second character of a punctuator */
};
/*
 * Character class table.  It is indexed by (character + 1) so that
 * EOF (-1) maps to slot 0, which carries no flags.
 */
static const long cclass[257] = {
	/* digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* upper-case letters */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1]             = Letter | Hex | Exp,
	['F' + 1]             = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1]             = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* lower-case letters */
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1]             = Letter | Hex | Exp,
	['f' + 1]             = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1]             = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,

	['_' + 1] = Letter,

	/* characters that can end a two-character punctuator */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};
/*
 * pp-number:
 *	digit
 *	. digit
 *	pp-number digit
 *	pp-number identifier-nodigit
 *	pp-number e sign
 *	pp-number E sign
 *	pp-number p sign
 *	pp-number P sign
 *	pp-number .
 */
/*
 * Scan a pp-number whose first character is 'c' and whose second
 * (already read) character is 'next'.
 *
 * Digits, letters and dots are accumulated; a sign directly after an
 * exponent marker (e, E, p, P) is accepted as part of the number.  The
 * text is copied into freshly allocated storage, stored in the token
 * under construction as TOKEN_NUMBER, and the token is appended to the
 * list.  A number that fills the whole scratch buffer is reported as an
 * error and replaced by "1".
 *
 * Returns the first character after the number.
 */
static int get_one_number(int c, int next, stream_t *stream)
{
	struct token *token;
	static char buffer[4095];
	char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
	int len;

	*p++ = c;
	for (;;) {
		long class = cclass[next + 1];
		if (!(class & (Dot | Digit | Letter)))
			break;
		if (p != buffer_end)
			*p++ = next;
		next = nextchar(stream);
		if (class & Exp) {
			if (next == '-' || next == '+') {
				if (p != buffer_end)
					*p++ = next;
				next = nextchar(stream);
			}
		}
	}

	if (p == buffer_end) {
		sparse_error(stream_pos(stream), "number token exceeds %td characters",
		      buffer_end - buffer);
		// Pretend we saw just "1".
		buffer[0] = '1';
		p = buffer + 1;
	}

	*p++ = 0;
	len = p - buffer;
	buf = __alloc_bytes(len);
	memcpy(buf, buffer, len);

	token = stream->token;
	token_type(token) = TOKEN_NUMBER;
	token->number = buf;
	add_token(stream);

	return next;
}
/*
 * Decode one (possibly escaped) character of a string or character
 * constant.
 *
 * first:  the character just read
 * type:   the delimiter of the enclosing literal ('"' or '\'')
 * valp:   receives the decoded value; bit 0x100 is set when the value
 *         came from a backslash escape, so an escaped delimiter never
 *         compares equal to the bare delimiter
 *
 * Simple escapes are translated, octal escapes take up to three octal
 * digits, \x takes any number of hex digits; both are masked to 8 bits.
 * An unknown escape (including \x without digits) produces a warning
 * and keeps the character as is.
 *
 * Returns the character following the one that was decoded.
 */
static int escapechar(int first, int type, stream_t *stream, int *valp)
{
	int next, value;

	next = nextchar(stream);
	value = first;

	if (first == '\n')
		warning(stream_pos(stream), "Newline in string or character constant");

	if (first == '\\' && next != EOF) {
		value = next;
		next = nextchar(stream);
		if (value != type) {
			switch (value) {
			case 'a':
				value = '\a';
				break;
			case 'b':
				value = '\b';
				break;
			case 't':
				value = '\t';
				break;
			case 'n':
				value = '\n';
				break;
			case 'v':
				value = '\v';
				break;
			case 'f':
				value = '\f';
				break;
			case 'r':
				value = '\r';
				break;
			case 'e':
				value = '\e';
				break;
			/* These stand for themselves. */
			case '\\':
			case '?':
			case '\'':
			case '"':
				break;
			case '\n':
				warning(stream_pos(stream), "Newline in string or character constant");
				break;
			case '0'...'7': {
				/* At most two more digits, and only octal ones. */
				int nr = 2;
				value -= '0';
				while (next >= '0' && next <= '7') {
					value = (value << 3) + (next-'0');
					next = nextchar(stream);
					if (!--nr)
						break;
				}
				value &= 0xff;
				break;
			}
			case 'x': {
				int hex = hexval(next);
				if (hex < 16) {
					value = hex;
					next = nextchar(stream);
					while ((hex = hexval(next)) < 16) {
						value = (value << 4) + hex;
						next = nextchar(stream);
					}
					value &= 0xff;
					break;
				}
			}
			/* Fall through */
			default:
				warning(stream_pos(stream), "Unknown escape '%c'", value);
			}
		}
		/* Mark it as escaped */
		value |= 0x100;
	}

	*valp = value;
	return next;
}
/*
 * Scan a character constant; 'next' is the character after the opening
 * quote.
 *
 * On success the token under construction becomes a TOKEN_CHAR holding
 * the low 8 bits of the value, it is appended to the list, and the
 * character after the closing quote is returned.  An empty constant or
 * a missing closing quote is reported as an error; the token is then
 * dropped and the current lookahead is returned.
 */
static int get_char_token(int next, stream_t *stream)
{
	int value;
	struct token *token;

	next = escapechar(next, '\'', stream, &value);
	if (value == '\'' || next != '\'') {
		sparse_error(stream_pos(stream), "Bad character constant");
		drop_token(stream);
		return next;
	}

	token = stream->token;
	token_type(token) = TOKEN_CHAR;
	token->character = value & 0xff;

	add_token(stream);
	return nextchar(stream);
}
/*
 * Scan a string literal; 'next' is the character after the opening
 * quote.
 *
 * Characters are decoded with escapechar() until an unescaped '"' is
 * seen.  At most MAX_STRING bytes are kept; a longer literal is
 * counted in full for the warning and then truncated.  The result is
 * stored NUL-terminated in a newly allocated struct string (length
 * includes the NUL), attached to the token as TOKEN_STRING, and the
 * token is appended to the list.
 *
 * Returns the character after the closing quote, or EOF if the input
 * ended inside the literal (no token is produced in that case).
 */
static int get_string_token(int next, stream_t *stream)
{
	static char buffer[MAX_STRING];
	struct string *string;
	struct token *token;
	int len = 0;

	for (;;) {
		int val;
		next = escapechar(next, '"', stream, &val);
		if (val == '"')
			break;
		if (next == EOF) {
			warning(stream_pos(stream), "End of file in middle of string");
			return next;
		}
		if (len < MAX_STRING)
			buffer[len] = val;
		len++;
	}

	if (len > MAX_STRING) {
		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
		/* Only MAX_STRING bytes were actually stored. */
		len = MAX_STRING;
	}

	string = __alloc_string(len+1);
	memcpy(string->data, buffer, len);
	string->data[len] = '\0';
	string->length = len+1;

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_STRING;
	token->string = string;
	add_token(stream);

	return next;
}
/*
 * Skip a "//" comment: discard input up to and including the next
 * newline and return the character after it (or EOF).
 *
 * No token results from a comment, so the pending token is dropped
 * first; that merges its newline/whitespace flags back into the stream
 * for the token that follows.
 */
static int drop_stream_eoln(stream_t *stream)
{
	drop_token(stream);
	for (;;) {
		switch (nextchar(stream)) {
		case EOF:
			return EOF;
		case '\n':
			return nextchar(stream);
		}
	}
}
/*
 * Skip a block comment and return the character following the closing
 * star-slash, or EOF (with a warning) if the comment is never closed.
 *
 * The pending token is dropped, since a comment yields no token, and
 * the stream's newline flag is restored afterwards so that newlines
 * inside the comment do not mark the next token as starting a line.
 */
static int drop_stream_comment(stream_t *stream)
{
	int newline;
	int next;

	drop_token(stream);
	newline = stream->newline;

	next = nextchar(stream);
	for (;;) {
		int curr = next;
		if (curr == EOF) {
			warning(stream_pos(stream), "End of file in the middle of a comment");
			return curr;
		}
		next = nextchar(stream);
		if (curr == '*' && next == '/')
			break;
	}
	stream->newline = newline;
	return nextchar(stream);
}
/* Spellings of the multi-character punctuators; show_special() indexes it by (code - SPECIAL_BASE). */
unsigned char combinations[][4] = COMBINATION_STRINGS;
/*
 * Hash function for two-character punctuators - all of them give
 * unique values in the range 0..31.  Arguments are parenthesised so
 * that expressions can be passed safely.
 */
#define special_hash(c0, c1) \
	(((((c0)*8+(c1)*2)+(((c0)*8+(c1)*2)>>5)))&31)
/*
 * Note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
/*
 * Verification table for special_hash(): slot special_hash(c0, c1)
 * holds the pair {c0, c1}, which get_one_special() compares against to
 * confirm that a hash value really belongs to that punctuator.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='),	/* slot 0x00 */
	RES('/', '='),	/* slot 0x01 */
	RES('^', '='),	/* slot 0x05 */
	RES('&', '&'),	/* slot 0x07 */
	RES('#', '#'),	/* slot 0x08 */
	RES('<', '<'),	/* slot 0x0a */
	RES('<', '='),	/* slot 0x0c */
	RES('!', '='),	/* slot 0x0e */
	RES('%', '='),	/* slot 0x0f */
	RES('-', '-'),	/* slot 0x10 */
	RES('-', '='),	/* slot 0x11 */
	RES('-', '>'),	/* slot 0x13 */
	RES('=', '='),	/* slot 0x15 */
	RES('&', '='),	/* slot 0x17 */
	RES('*', '='),	/* slot 0x18 */
	RES('.', '.'),	/* slot 0x1a */
	RES('+', '+'),	/* slot 0x1b */
	RES('|', '='),	/* slot 0x1c */
	RES('>', '='),	/* slot 0x1d */
	RES('|', '|'),	/* slot 0x1e */
	RES('>', '>')	/* slot 0x1f */
};
#undef RES
/*
 * Punctuator code for each two-character combination, stored at the
 * slot given by special_hash() (same layout as hash_results[]).
 */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN),	/* slot 0x00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN),	/* slot 0x01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN),	/* slot 0x05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND),	/* slot 0x07 */
	CODE('#', '#', SPECIAL_HASHHASH),	/* slot 0x08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT),	/* slot 0x0a */
	CODE('<', '=', SPECIAL_LTE),		/* slot 0x0c */
	CODE('!', '=', SPECIAL_NOTEQUAL),	/* slot 0x0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN),	/* slot 0x0f */
	CODE('-', '-', SPECIAL_DECREMENT),	/* slot 0x10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN),	/* slot 0x11 */
	CODE('-', '>', SPECIAL_DEREFERENCE),	/* slot 0x13 */
	CODE('=', '=', SPECIAL_EQUAL),		/* slot 0x15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN),	/* slot 0x17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN),	/* slot 0x18 */
	CODE('.', '.', SPECIAL_DOTDOT),		/* slot 0x1a */
	CODE('+', '+', SPECIAL_INCREMENT),	/* slot 0x1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN),	/* slot 0x1c */
	CODE('>', '=', SPECIAL_GTE),		/* slot 0x1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR),	/* slot 0x1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT)	/* slot 0x1f */
};
#undef CODE
/*
 * Handle a token that starts with the non-alphanumeric character 'c'.
 *
 * Numbers starting with '.', string literals, character constants and
 * both comment forms are dispatched to their own scanners.  Anything
 * else is a punctuator: a recognised two-character combination is
 * looked up through special_hash(), and may be extended by a third
 * character.  The result is stored in the token under construction as
 * TOKEN_SPECIAL and the token is appended to the list.
 *
 * Returns the first character after the token.
 */
static int get_one_special(int c, stream_t *stream)
{
	struct token *token;
	int next, value, i;

	next = nextchar(stream);

	/*
	 * Check for numbers, strings, character constants, and comments
	 */
	switch (c) {
	case '.':
		if (next >= '0' && next <= '9')
			return get_one_number(c, next, stream);
		break;
	case '"':
		return get_string_token(next, stream);
	case '\'':
		return get_char_token(next, stream);
	case '/':
		if (next == '/')
			return drop_stream_eoln(stream);
		if (next == '*')
			return drop_stream_comment(stream);
	}

	/*
	 * Check for combinations
	 */
	value = c;
	if (cclass[next + 1] & ValidSecond) {
		i = special_hash(c, next);
		if (hash_results[i][0] == c && hash_results[i][1] == next) {
			value = code[i];
			next = nextchar(stream);
			/*
			 * Codes from SPECIAL_LEFTSHIFT on may take a third
			 * character ('=', '=' or '.'); the three-character code
			 * is the two-character one plus 3.
			 * NOTE(review): relies on the ordering of the SPECIAL_*
			 * enum in token.h - confirm.
			 */
			if (value >= SPECIAL_LEFTSHIFT &&
			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
				value += 3;
				next = nextchar(stream);
			}
		}
	}

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_SPECIAL;
	token->special = value;
	add_token(stream);
	return next;
}
/* Shift used by ident_hash_end() to fold the high bits of the hash down. */
#define IDENT_HASH_BITS (13)
/*
 * Incremental identifier hash: seed with the first character, feed each
 * further character through ident_hash_add(), then reduce the result to
 * a table index with ident_hash_end().
 */
#define ident_hash_init(c) (c)
#define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
#define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
/* Chained hash table of identifiers; buckets are linked through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics; ident_hit and ident_miss are printed by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
void show_identifier_stats(void)
int i;
int distribution[100];
fprintf(stderr, "identifiers: %d hits, %d misses\n",
ident_hit, ident_miss);
for (i = 0; i < 100; i++)
distribution[i] = 0;
for (i = 0; i < IDENT_HASH_SIZE; i++) {
struct ident * ident = hash_table[i];
int count = 0;
while (ident) {
ident = ident->next;
if (count > 99)
count = 99;
for (i = 0; i < 100; i++) {
if (distribution[i])
fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
static struct ident *alloc_ident(const char *name, int len)
struct ident *ident = __alloc_ident(len);
ident->symbols = NULL;
ident->len = len;
ident->tainted = 0;
memcpy(ident->name, name, len);
return ident;
static struct ident * insert_hash(struct ident *ident, unsigned long hash)
ident->next = hash_table[hash];
hash_table[hash] = ident;
return ident;
static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
struct ident *ident;
struct ident **p;
p = &hash_table[hash];
while ((ident = *p) != NULL) {
if (ident->len == (unsigned char) len) {
if (strncmp(name, ident->name, len) != 0)
goto next;
return ident;
p = &ident->next;
ident = alloc_ident(name, len);
*p = ident;
ident->next = NULL;
return ident;
/*
 * Compute the hash table index for the first 'len' characters of
 * 'name', using the same ident_hash_* steps as get_one_identifier().
 *
 * The first byte is always read, so 'name' must point at readable
 * storage even for len == 0.  The loop condition is "> 0" so that a
 * zero or negative length cannot send it running through memory.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash = ident_hash_init(*p++);

	while (--len > 0) {
		unsigned int ch = *p++;
		hash = ident_hash_add(hash, ch);
	}
	return ident_hash_end(hash);
}
struct ident *hash_ident(struct ident *ident)
return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string 'name', creating
 * it in the hash table if it does not exist yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
struct token *built_in_token(int stream, const char *name)
struct token *token;
token = __alloc_token(0);
token-> = stream;
token_type(token) = TOKEN_IDENT;
token->ident = built_in_ident(name);
return token;
/*
 * Scan an identifier whose first character is 'c'.
 *
 * Letters and digits are collected (and hashed on the fly) until a
 * different character is seen or the local buffer is full.  The
 * identifier is looked up or created in the hash table, stored in the
 * token under construction as TOKEN_IDENT, and the token is appended
 * to the list.
 *
 * Returns the first character that is not part of the identifier.
 */
static int get_one_identifier(int c, stream_t *stream)
{
	struct token *token;
	struct ident *ident;
	unsigned long hash;
	char buf[256];
	int len = 1;
	int next;

	hash = ident_hash_init(c);
	buf[0] = c;
	for (;;) {
		next = nextchar(stream);
		if (!(cclass[next + 1] & (Letter | Digit)))
			break;
		if (len >= sizeof(buf))
			break;
		hash = ident_hash_add(hash, next);
		buf[len] = next;
		len++;
	}
	hash = ident_hash_end(hash);

	ident = create_hashed_ident(buf, len, hash);

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_IDENT;
	token->ident = ident;
	add_token(stream);
	return next;
}
/*
 * Scan one token starting with character 'c', dispatching on its
 * character class: digits start a number, letters an identifier, and
 * everything else goes to get_one_special().
 *
 * Returns the first character after the token.
 */
static int get_one_token(int c, stream_t *stream)
{
	const long kind = cclass[c + 1];

	if (kind & Digit) {
		int lookahead = nextchar(stream);

		return get_one_number(c, lookahead, stream);
	}
	if (kind & Letter)
		return get_one_identifier(c, stream);

	return get_one_special(c, stream);
}
/*
 * Initialise 'stream' for tokenizing and return its TOKEN_STREAMBEGIN
 * token, which heads the token list.
 *
 * idx:      stream number recorded in every position
 * fd:       descriptor to read from
 * buf:      buffer used for the input bytes
 * buf_size: number of bytes already valid in 'buf'
 *
 * Scanning starts at line 1, column 0, with the newline flag set.
 */
static struct token *setup_stream(stream_t *stream, int idx, int fd,
	unsigned char *buf, unsigned int buf_size)
{
	struct token *head;

	/* Input side. */
	stream->fd = fd;
	stream->buffer = buf;
	stream->size = buf_size;
	stream->offset = 0;

	/* Position and flags; these must be set before alloc_token() reads them. */
	stream->nr = idx;
	stream->line = 1;
	stream->pos = 0;
	stream->newline = 1;
	stream->whitespace = 0;
	stream->token = NULL;

	head = alloc_token(stream);
	token_type(head) = TOKEN_STREAMBEGIN;
	stream->tokenlist = &head->next;
	return head;
}
/*
 * Tokenize everything 'stream' has to offer and terminate the token
 * list with mark_eof(stream, endtoken).
 *
 * Whitespace sets the stream's whitespace flag.  For any other
 * character a token is allocated (capturing the current flags in its
 * position), the flags are cleared, and get_one_token() scans the
 * token and hands back the lookahead character it stopped at.
 */
static void tokenize_stream(stream_t *stream, struct token *endtoken)
{
	int c = nextchar(stream);

	while (c != EOF) {
		if (isspace(c)) {
			stream->whitespace = 1;
			c = nextchar(stream);
		} else {
			struct token *token = alloc_token(stream);

			stream->token = token;
			stream->newline = 0;
			stream->whitespace = 0;
			/* The returned lookahead is the next character to examine. */
			c = get_one_token(c, stream);
		}
	}
	mark_eof(stream, endtoken);
}
/*
 * Tokenize 'size' bytes of in-memory text at 'buffer'.  The tokens are
 * attributed to stream number 0 and no file descriptor is used (fd is
 * -1).  The list is terminated via mark_eof() with 'endtoken'.
 *
 * Returns the TOKEN_STREAMBEGIN token heading the list.
 */
struct token * tokenize_buffer(void *buffer, unsigned long size, struct token *endtoken)
{
	stream_t state;
	struct token *head = setup_stream(&state, 0, -1, buffer, size);

	tokenize_stream(&state, endtoken);
	return head;
}
/*
 * Register 'name'/'fd' as a new input stream and tokenize it, reading
 * through a BUFSIZE-byte buffer on the stack.  The list is terminated
 * via mark_eof() with 'endtoken'.
 *
 * Returns the TOKEN_STREAMBEGIN token heading the list, or 'endtoken'
 * itself if init_stream() reports a negative stream number.
 */
struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
{
	unsigned char buffer[BUFSIZE];
	stream_t state;
	struct token *head;
	int idx = init_stream(name, fd, next_path);

	/* No stream was set up: the result is just the end token. */
	if (idx < 0)
		return endtoken;

	/* Start with an empty buffer; the first read fills it. */
	head = setup_stream(&state, idx, fd, buffer, 0);
	tokenize_stream(&state, endtoken);
	return head;
}