• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef _PY_LEXER_H_
2 #define _PY_LEXER_H_
3 
4 #include "object.h"
5 
/* Capacity limits used to size the tokenizer's fixed-length stacks. */
#define MAXINDENT        100  /* Deepest indentation level supported */
#define MAXLEVEL         200  /* Deepest parentheses nesting supported */
#define MAXFSTRINGLEVEL  150  /* Deepest f-string nesting supported */
9 
/*
 * True when `tok->tok_mode_stack_index` is greater than zero.
 * `tok` must be a pointer to a struct that has that member (struct tok_state
 * in this header).  The argument is parenthesized so that expressions such
 * as `base + 1` can be passed safely.
 */
#define INSIDE_FSTRING(tok) ((tok)->tok_mode_stack_index > 0)
/*
 * True when `tok->curly_bracket_expr_start_depth` is non-negative.
 * Note that in this header the member belongs to tokenizer_mode, not to
 * struct tok_state, so despite the parameter name the argument must be a
 * pointer to a tokenizer_mode.
 */
#define INSIDE_FSTRING_EXPR(tok) ((tok)->curly_bracket_expr_start_depth >= 0)
12 
/*
 * Decoding progress of the tokenizer input; stored in
 * tok_state.decoding_state alongside the other PEP 263 fields.
 */
enum decoding_state {
    STATE_INIT = 0,
    STATE_SEEK_CODING = 1,
    STATE_NORMAL = 2
};
18 
/* What to do when a new token is requested in interactive mode. */
enum interactive_underflow_t {
    /* Default behaviour: hand back a new token when one is requested. */
    IUNDERFLOW_NORMAL = 0,
    /* Always answer with ENDMARKER instead of a new token.  This keeps the
     * tokenizer from prompting the user for more input. */
    IUNDERFLOW_STOP = 1,
};
26 
27 struct token {
28     int level;
29     int lineno, col_offset, end_lineno, end_col_offset;
30     const char *start, *end;
31     PyObject *metadata;
32 };
33 
/* Kind tag stored in each tokenizer_mode entry (tokenizer_mode.kind). */
enum tokenizer_mode_kind_t {
    TOK_REGULAR_MODE = 0,
    TOK_FSTRING_MODE = 1,
};
38 
39 #define MAX_EXPR_NESTING 3
40 
41 typedef struct _tokenizer_mode {
42     enum tokenizer_mode_kind_t kind;
43 
44     int curly_bracket_depth;
45     int curly_bracket_expr_start_depth;
46 
47     char f_string_quote;
48     int f_string_quote_size;
49     int f_string_raw;
50     const char* f_string_start;
51     const char* f_string_multi_line_start;
52     int f_string_line_start;
53 
54     Py_ssize_t f_string_start_offset;
55     Py_ssize_t f_string_multi_line_start_offset;
56 
57     Py_ssize_t last_expr_size;
58     Py_ssize_t last_expr_end;
59     char* last_expr_buffer;
60     int f_string_debug;
61     int in_format_spec;
62 } tokenizer_mode;
63 
64 /* Tokenizer state */
65 struct tok_state {
66     /* Input state; buf <= cur <= inp <= end */
67     /* NB an entire line is held in the buffer */
68     char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL or readline != NULL */
69     char *cur;          /* Next character in buffer */
70     char *inp;          /* End of data in buffer */
71     int fp_interactive; /* If the file descriptor is interactive */
72     char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
73     char *interactive_src_end; /* The end of the source parsed so far in interactive mode */
74     const char *end;    /* End of input buffer if buf != NULL */
75     const char *start;  /* Start of current token if not NULL */
76     int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
77     /* NB If done != E_OK, cur must be == inp!!! */
78     FILE *fp;           /* Rest of input; NULL if tokenizing a string */
79     int tabsize;        /* Tab spacing */
80     int indent;         /* Current indentation index */
81     int indstack[MAXINDENT];            /* Stack of indents */
82     int atbol;          /* Nonzero if at begin of new line */
83     int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
84     const char *prompt, *nextprompt;          /* For interactive prompting */
85     int lineno;         /* Current line number */
86     int first_lineno;   /* First line of a single line or multi line string
87                            expression (cf. issue 16806) */
88     int starting_col_offset; /* The column offset at the beginning of a token */
89     int col_offset;     /* Current col offset */
90     int level;          /* () [] {} Parentheses nesting level */
91             /* Used to allow free continuations inside them */
92     char parenstack[MAXLEVEL];
93     int parenlinenostack[MAXLEVEL];
94     int parencolstack[MAXLEVEL];
95     PyObject *filename;
96     /* Stuff for checking on different tab sizes */
97     int altindstack[MAXINDENT];         /* Stack of alternate indents */
98     /* Stuff for PEP 0263 */
99     enum decoding_state decoding_state;
100     int decoding_erred;         /* whether erred in decoding  */
101     char *encoding;         /* Source encoding. */
102     int cont_line;          /* whether we are in a continuation line. */
103     const char* line_start;     /* pointer to start of current line */
104     const char* multi_line_start; /* pointer to start of first line of
105                                      a single line or multi line string
106                                      expression (cf. issue 16806) */
107     PyObject *decoding_readline; /* open(...).readline */
108     PyObject *decoding_buffer;
109     PyObject *readline;     /* readline() function */
110     const char* enc;        /* Encoding for the current str. */
111     char* str;          /* Source string being tokenized (if tokenizing from a string)*/
112     char* input;       /* Tokenizer's newline translated copy of the string. */
113 
114     int type_comments;      /* Whether to look for type comments */
115 
116     /* How to proceed when asked for a new token in interactive mode */
117     enum interactive_underflow_t interactive_underflow;
118     int (*underflow)(struct tok_state *); /* Function to call when buffer is empty and we need to refill it*/
119 
120     int report_warnings;
121     // TODO: Factor this into its own thing
122     tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
123     int tok_mode_stack_index;
124     int tok_extra_tokens;
125     int comment_newline;
126     int implicit_newline;
127 #ifdef Py_DEBUG
128     int debug;
129 #endif
130 };
131 
/*
 * Token setup entry points; the definitions live outside this header.
 * NOTE(review): presumably these populate *token from the tokenizer state,
 * the token type and the start/end pointers -- confirm against the
 * definitions, including the meaning of the int return value.
 */
int _PyLexer_type_comment_token_setup(struct tok_state *tok,
                                      struct token *token,
                                      int type,
                                      int col_offset,
                                      int end_col_offset,
                                      const char *start,
                                      const char *end);
int _PyLexer_token_setup(struct tok_state *tok,
                         struct token *token,
                         int type,
                         const char *start,
                         const char *end);
135 
/*
 * Lifecycle entry points for tokenizer state and token records; the
 * definitions live outside this header.
 * NOTE(review): ownership and failure conventions (e.g. whether
 * _PyTokenizer_tok_new can return NULL) are not visible here -- confirm
 * against the definitions.
 */
struct tok_state *_PyTokenizer_tok_new(void);
void _PyTokenizer_Free(struct tok_state *tok);
void _PyToken_Free(struct token *token);
void _PyToken_Init(struct token *token);
140 
141 
142 #endif
143