Python 底層運作 02 - PyTokenizer_Get 分析

當 Python 在讀入程式碼的時候，第一步就是對程式碼 lexing，將程式碼拆解成 token，標記出每個元素的分類為何。

以下面的程式碼為例：

foo = 12345

1	foo = 12345

我們透過 tokenize module 來分析，會變成：

$ ./python -m tokenize tests.py
0,0-0,0:            ENCODING       'utf-8'
1,0-1,3:            NAME           'foo'
1,4-1,5:            OP             '='
1,6-1,11:           NUMBER         '12345'
1,11-1,12:          NEWLINE        '\n'
2,0-2,1:            NL             '\n'
3,0-3,1:            NL             '\n'
4,0-4,0:            ENDMARKER      ''

$ ./python -m tokenize tests.py

0,0-0,0: ENCODING 'utf-8'

1,0-1,3: NAME 'foo'

1,4-1,5: OP '='

1,6-1,11: NUMBER '12345'

1,11-1,12: NEWLINE '\n'

2,0-2,1: NL '\n'

3,0-3,1: NL '\n'

4,0-4,0: ENDMARKER ''

我們可以看到 foo 被標記為 NAME，= 標記為 OP (operator)，12345 則被標記為 NUMBER。

在 CPython 中，負責將程式碼標記成 token 的程式碼是 PyTokenizer_Get ，位於 Parser/tokenizer.c。PyTokenizer_Get 本身是 tok_get 的一個 wrapper，用來處理 decoding error 的 handling；其回傳值是該 token 的類別，這些類別的值在 Include/token.h 以 macro 的方式被定義出來；p_start 以及 p_end 則是該 token 的起始以及結束位置：

int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

int

PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)

{

int result = tok_get(tok, p_start, p_end);

if (tok->decoding_erred) {

result = ERRORTOKEN;

tok->done = E_DECODE;

}

return result;

}

#define ENDMARKER	0
#define NAME		1
#define NUMBER		2
#define STRING		3
#define NEWLINE		4
#define INDENT		5
#define DEDENT		6
…
#define RIGHTSHIFTEQUAL	45
#define DOUBLESTAREQUAL	46
#define DOUBLESLASH	47
#define DOUBLESLASHEQUAL 48
#define AT              49
#define ATEQUAL		50
#define RARROW          51
#define ELLIPSIS        52
/* Don’t forget to update the table _PyParser_TokenNames in tokenizer.c! */
#define OP		53
#define AWAIT		54
#define ASYNC		55
#define ERRORTOKEN	56
#define N_TOKENS	57

/* Special definitions for cooperation with parser */

#define NT_OFFSET		256

#define ENDMARKER 0

#define NAME 1

#define NUMBER 2

#define STRING 3

#define NEWLINE 4

#define INDENT 5

#define DEDENT 6

...

#define RIGHTSHIFTEQUAL 45

#define DOUBLESTAREQUAL 46

#define DOUBLESLASH 47

#define DOUBLESLASHEQUAL 48

#define AT 49

#define ATEQUAL 50

#define RARROW 51

#define ELLIPSIS 52

/* Don’t forget to update the table _PyParser_TokenNames in tokenizer.c! */

#define OP 53

#define AWAIT 54

#define ASYNC 55

#define ERRORTOKEN 56

#define N_TOKENS 57

/* Special definitions for cooperation with parser */

#define NT_OFFSET 256

主要的處理集中在 tok_get 當中，這個 function 的目的是要獲得下一個 token，並且會將空白 strip 掉。

整段 tok_get 的 function code 有點長，大致上可以拆成兩段來看：

獲得 next token 的 first character
透過 first character 分類
- Identifier
- Newline
- Period or number starting with period
- Number
- String
- Line continuation
- Two-character token
- Parentheses nesting level

介紹 struct tok_state

稍後會用到 struct tok_state，因此必須先介紹這個 structure：

/* Tokenizer state */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;          /* Input buffer, or NULL; malloc’ed if fp != NULL */
    char *cur;          /* Next character in buffer */
    char *inp;          /* End of data in buffer */
    char *end;          /* End of input buffer if buf != NULL */
    char *start;        /* Start of current token if not NULL */
    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
    int tabsize;        /* Tab spacing */
    int indent;         /* Current indentation index */
    int indstack[MAXINDENT];            /* Stack of indents */
    int atbol;          /* Nonzero if at begin of new line */
    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt;          /* For interactive prompting */
    int lineno;         /* Current line number */
    int level;          /* () [] {} Parentheses nesting level */

/* Tokenizer state */

struct tok_state {

/* Input state; buf <= cur <= inp <= end */

/* NB an entire line is held in the buffer */

char *buf; /* Input buffer, or NULL; malloc’ed if fp != NULL */

char *cur; /* Next character in buffer */

char *inp; /* End of data in buffer */

char *end; /* End of input buffer if buf != NULL */

char *start; /* Start of current token if not NULL */

int done; /* E_OK normally, E_EOF at EOF, otherwise error code */

/* NB If done != E_OK, cur must be == inp!!! */

FILE *fp; /* Rest of input; NULL if tokenizing a string */

int tabsize; /* Tab spacing */

int indent; /* Current indentation index */

int indstack[MAXINDENT]; /* Stack of indents */

int atbol; /* Nonzero if at begin of new line */

int pendin; /* Pending indents (if > 0) or dedents (if < 0) */

const char *prompt, *nextprompt; /* For interactive prompting */

int lineno; /* Current line number */

int level; /* () [] {} Parentheses nesting level */

獲得 next token 的 first character

static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

static int

tok_get(struct tok_state *tok, char **p_start, char **p_end)

{

int c;

int blankline, nonascii;

*p_start = *p_end = NULL;

nextline:

tok->start = NULL;

blankline = 0;

一些初始化動作。

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014')  {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);

/* Get indentation level */

if (tok->atbol) {

int col = 0;

int altcol = 0;

tok->atbol = 0;

for (;;) {

c = tok_nextc(tok);

if (c == ' ') {

col++, altcol++;

}

else if (c == '\t') {

col = (col/tok->tabsize + 1) * tok->tabsize;

altcol = (altcol/tok->alttabsize + 1)

* tok->alttabsize;

}

else if (c == '\014') {/* Control-L (formfeed) */

col = altcol = 0; /* For Emacs users */

}

else {

break;

}

}

tok_backup(tok, c);

重點是在 tok_nextc 這 function。

tok_nextc – 擷取下一個 token 的字元

decoding_fgets, bring fp to buf
tok->done = E_OK;
tok->inp = strchr(tok->buf, '\0');
done = tok->inp == tok->buf || tok->inp[-1] == '\n';

透過 first character 分類

拿到 first character 後，大致上的程式碼如下：

/* Set start of current token */
tok->start = tok->cur - 1;

/* Skip comment */
if (c == '#') {
}

/* Check for EOF and errors now */
if (c == EOF) {
    return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
}

/* Identifier (most frequent token!) */
if (is_potential_identifier_start(c)) {
}

/* Newline */
if (c == '\n') {
}

/* Period or number starting with period? */
if (c == '.')  {
}

/* Number */
if (isdigit(c)) {
}

letter_quote:
/* String */
if (c == '\'' || c == '"') {
}

/* Line continuation */
if (c == '\\') {
}

/* Check for two-character token */
{
    int c2 = tok_nextc(tok);
    int token = PyToken_TwoChars(c, c2);
    if (token != OP) {
        int c3 = tok_nextc(tok);
        int token3 = PyToken_ThreeChars(c, c2, c3);
        if (token3 != OP) {
            token = token3;
        }
        else {
            tok_backup(tok, c3);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return token;
    }
    tok_backup(tok, c2);
}

/* Keep track of parentheses nesting level */
switch (c) {
case '(':
case '[':
case '{':
    tok->level++;
    break;
case ')':
case ']':
case '}':
    tok->level--;
    break;
}

/* Punctuation character */
*p_start = tok->start;
*p_end = tok->cur;
return PyToken_OneChar(c);

/* Set start of current token */

tok->start = tok->cur - 1;

/* Skip comment */

if (c == '#') {

}

/* Check for EOF and errors now */

if (c == EOF) {

return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;

}

/* Identifier (most frequent token!) */

if (is_potential_identifier_start(c)) {

}

/* Newline */

if (c == '\n') {

}

/* Period or number starting with period? */

if (c == '.') {

}

/* Number */

if (isdigit(c)) {

}

letter_quote:

/* String */

if (c == '\'' || c == '"') {

}

/* Line continuation */

if (c == '\\') {

}

/* Check for two-character token */

{

int c2 = tok_nextc(tok);

int token = PyToken_TwoChars(c, c2);

if (token != OP) {

int c3 = tok_nextc(tok);

int token3 = PyToken_ThreeChars(c, c2, c3);

if (token3 != OP) {

token = token3;

}

else {

tok_backup(tok, c3);

}

*p_start = tok->start;

*p_end = tok->cur;

return token;

}

tok_backup(tok, c2);

}

/* Keep track of parentheses nesting level */

switch (c) {

case '(':

case '[':

case '{':

tok->level++;

break;

case ')':

case ']':

case '}':

tok->level--;

break;

}

/* Punctuation character */

*p_start = tok->start;

*p_end = tok->cur;

return PyToken_OneChar(c);

透過這些 if statement 來判斷目前的 token 屬於哪個 type，這裡也是解析一些特殊型態的數值的地方，例如說在 3.6 版新增的 “f-string”，可以看到會在第一段的地方處理：

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                goto letter_quote;
            }
        }

/* Identifier (most frequent token!) */

nonascii = 0;

if (is_potential_identifier_start(c)) {

/* Process the various legal combinations of b"", r"", u"", and f"". */

int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;

while (1) {

if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))

saw_b = 1;

/* Since this is a backwards compatibility support literal we don't

want to support it in arbitrary order like byte literals. */

else if (!(saw_b || saw_u || saw_r || saw_f)

&& (c == 'u'|| c == 'U')) {

saw_u = 1;

}

/* ur"" and ru"" are not supported */

else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {

saw_r = 1;

}

else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {

saw_f = 1;

}

else {

break;

}

c = tok_nextc(tok);

if (c == '"' || c == '\'') {

goto letter_quote;

}

}

或是新的底線數值 “10_000_000” 的解析，則是在 tok_decimal_tail 處理：

static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (isdigit(c));
        if (c != '_') {
            break;
        }
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok->done = E_TOKEN;
            tok_backup(tok, c);
            return 0;
        }
    }
    return c;
}

static int

tok_decimal_tail(struct tok_state *tok)

{

int c;

while (1) {

do {

c = tok_nextc(tok);

} while (isdigit(c));

if (c != '_') {

break;

}

c = tok_nextc(tok);

if (!isdigit(c)) {

tok->done = E_TOKEN;

tok_backup(tok, c);

return 0;

}

}

return c;

}

回到 parsetok

最後，PyTokenizer_Get 會將數值回傳到 parsetok 裏面：

type = PyTokenizer_Get(tok, &a, &b);

1	type = PyTokenizer_Get(tok, &a, &b);

type: 當前 token type
a: token_position_start
b: token_position_end

Python 底層運作 02 – PyTokenizer_Get 分析

介紹 struct tok_state

獲得 next token 的 first character

tok_nextc – 擷取下一個 token 的字元

透過 first character 分類

回到 parsetok

Comments

Leave a Reply Cancel reply