1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21
22 namespace Fortran::parser {
23
24 using common::LanguageFeature;
25
// Largest permitted value of prescannerNesting_.  Prescan() reports an
// error and stops once the nesting depth exceeds this limit; the depth is
// incremented each time a Prescanner is copy-constructed.
static constexpr int maxPrescannerNesting{100};
27
Prescanner(Messages & messages,CookedSource & cooked,Preprocessor & preprocessor,common::LanguageFeatureControl lfc)28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29 Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30 : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31 allSources_{preprocessor_.allSources()}, features_{lfc},
32 encoding_{allSources_.encoding()} {}
33
// Copy constructor.  The copy shares the message sink, cooked output,
// preprocessor and source table of `that`, inherits its source-form
// settings and directive sentinel tables, and records a nesting depth one
// greater than the original's.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_},
      cooked_{that.cooked_},
      preprocessor_{that.preprocessor_},
      allSources_{that.allSources_},
      features_{that.features_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {
}
44
// True when `ch` is one of the characters accepted here as a fixed-form
// comment marker: '!', '*', 'C', or 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50 char *p{dir.GetMutableCharData()};
51 char *limit{p + dir.SizeInChars()};
52 for (; p < limit; ++p) {
53 if (*p != ' ') {
54 CHECK(IsFixedFormCommentChar(*p));
55 *p = '!';
56 return;
57 }
58 }
59 DIE("compiler directive all blank");
60 }
61
Prescan(ProvenanceRange range)62 void Prescanner::Prescan(ProvenanceRange range) {
63 startProvenance_ = range.start();
64 start_ = allSources_.GetSource(range);
65 CHECK(start_);
66 limit_ = start_ + range.size();
67 nextLine_ = start_;
68 const bool beganInFixedForm{inFixedForm_};
69 if (prescannerNesting_ > maxPrescannerNesting) {
70 Say(GetProvenance(start_),
71 "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72 return;
73 }
74 while (!IsAtEnd()) {
75 Statement();
76 }
77 if (inFixedForm_ != beganInFixedForm) {
78 std::string dir{"!dir$ "};
79 if (beganInFixedForm) {
80 dir += "fixed";
81 } else {
82 dir += "free";
83 }
84 dir += '\n';
85 TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86 tokens.Emit(cooked_);
87 }
88 }
89
// Prescans one statement beginning at nextLine_.
// The line is classified first:
//  - comment lines are skipped;
//  - INCLUDE lines are handed to FortranInclude();
//  - preprocessor directive lines are tokenized and handed to the
//    preprocessor;
//  - compiler directive lines and ordinary source lines are tokenized with
//    NextToken(), passed through macro replacement, and emitted to cooked_,
//    followed by a newline unless omitNewline_ was set while scanning.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), this);
    return;
  case LineClassification::Kind::CompilerDirective:
    // Remember the sentinel so that continuation lines can be matched
    // against it; it is cleared again at the end of this function.
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      // Free form: skip leading white space up to the '!'.
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.  Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // One-shot flag (set by SkipCommentLine): step over a leading '&'
      // on this line, if there is one.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the rest of the statement.
  while (NextToken(tokens)) {
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.  Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->RemoveLastToken(); // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      // Nothing is emitted for a line that now classifies as a comment.
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // The directive is not executed; a message is issued and the text is
      // emitted as it stands.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_en_US);
      preprocessed->ToLowerCase().CheckBadFortranCharacters(messages_).Emit(
          cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */)
          .CheckBadFortranCharacters(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase()
          .ClipComment()
          .CheckBadFortranCharacters(messages_)
          .Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place: emit the tokens as scanned.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    tokens.CheckBadFortranCharacters(messages_).Emit(cooked_);
  }
  if (omitNewline_) {
    // One-shot request to suppress the newline for this statement.
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
229
TokenizePreprocessorDirective()230 TokenSequence Prescanner::TokenizePreprocessorDirective() {
231 CHECK(!IsAtEnd() && !inPreprocessorDirective_);
232 inPreprocessorDirective_ = true;
233 BeginStatementAndAdvance();
234 TokenSequence tokens;
235 while (NextToken(tokens)) {
236 }
237 inPreprocessorDirective_ = false;
238 return tokens;
239 }
240
NextLine()241 void Prescanner::NextLine() {
242 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
243 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
244 if (!v) {
245 nextLine_ = limit_;
246 } else {
247 const char *nl{const_cast<const char *>(static_cast<char *>(v))};
248 nextLine_ = nl + 1;
249 }
250 }
251
LabelField(TokenSequence & token)252 void Prescanner::LabelField(TokenSequence &token) {
253 const char *bad{nullptr};
254 int outCol{1};
255 for (; *at_ != '\n' && column_ <= 6; ++at_) {
256 if (*at_ == '\t') {
257 ++at_;
258 column_ = 7;
259 break;
260 }
261 if (*at_ != ' ' &&
262 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
263 EmitChar(token, *at_);
264 ++outCol;
265 if (!bad && !IsDecimalDigit(*at_)) {
266 bad = at_;
267 }
268 }
269 ++column_;
270 }
271 if (outCol == 1) { // empty label field
272 // Emit a space so that, if the line is rescanned after preprocessing,
273 // a leading 'C' or 'D' won't be left-justified and then accidentally
274 // misinterpreted as a comment card.
275 EmitChar(token, ' ');
276 ++outCol;
277 } else {
278 if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
279 Say(GetProvenance(bad),
280 "Character in fixed-form label field must be a digit"_en_US);
281 }
282 }
283 token.CloseToken();
284 SkipToNextSignificantCharacter();
285 if (IsDecimalDigit(*at_)) {
286 Say(GetProvenance(at_),
287 "Label digit is not in fixed-form label field"_en_US);
288 }
289 }
290
SkipToEndOfLine()291 void Prescanner::SkipToEndOfLine() {
292 while (*at_ != '\n') {
293 ++at_, ++column_;
294 }
295 }
296
MustSkipToEndOfLine() const297 bool Prescanner::MustSkipToEndOfLine() const {
298 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
299 return true; // skip over ignored columns in right margin (73:80)
300 } else if (*at_ == '!' && !inCharLiteral_) {
301 return true; // inline comment goes to end of source line
302 } else {
303 return false;
304 }
305 }
306
NextChar()307 void Prescanner::NextChar() {
308 CHECK(*at_ != '\n');
309 ++at_, ++column_;
310 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
311 // UTF-8 byte order mark - treat this file as UTF-8
312 at_ += 3;
313 encoding_ = Encoding::UTF_8;
314 }
315 SkipToNextSignificantCharacter();
316 }
317
318 // Skip everything that should be ignored until the next significant
319 // character is reached; handles C-style comments in preprocessing
320 // directives, Fortran ! comments, stuff after the right margin in
321 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()322 void Prescanner::SkipToNextSignificantCharacter() {
323 if (inPreprocessorDirective_) {
324 SkipCComments();
325 } else {
326 bool mightNeedSpace{false};
327 if (MustSkipToEndOfLine()) {
328 SkipToEndOfLine();
329 } else {
330 mightNeedSpace = *at_ == '\n';
331 }
332 for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
333 if (MustSkipToEndOfLine()) {
334 SkipToEndOfLine();
335 }
336 }
337 if (*at_ == '\t') {
338 tabInCurrentLine_ = true;
339 }
340 }
341 }
342
SkipCComments()343 void Prescanner::SkipCComments() {
344 while (true) {
345 if (IsCComment(at_)) {
346 if (const char *after{SkipCComment(at_)}) {
347 column_ += after - at_;
348 // May have skipped over one or more newlines; relocate the start of
349 // the next line.
350 nextLine_ = at_ = after;
351 NextLine();
352 } else {
353 // Don't emit any messages about unclosed C-style comments, because
354 // the sequence /* can appear legally in a FORMAT statement. There's
355 // no ambiguity, since the sequence */ cannot appear legally.
356 break;
357 }
358 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
359 at_[1] == '\n' && !IsAtEnd()) {
360 BeginSourceLineAndAdvance();
361 } else {
362 break;
363 }
364 }
365 }
366
SkipSpaces()367 void Prescanner::SkipSpaces() {
368 while (*at_ == ' ' || *at_ == '\t') {
369 NextChar();
370 }
371 insertASpace_ = false;
372 }
373
SkipWhiteSpace(const char * p)374 const char *Prescanner::SkipWhiteSpace(const char *p) {
375 while (*p == ' ' || *p == '\t') {
376 ++p;
377 }
378 return p;
379 }
380
SkipWhiteSpaceAndCComments(const char * p) const381 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
382 while (true) {
383 if (*p == ' ' || *p == '\t') {
384 ++p;
385 } else if (IsCComment(p)) {
386 if (const char *after{SkipCComment(p)}) {
387 p = after;
388 } else {
389 break;
390 }
391 } else {
392 break;
393 }
394 }
395 return p;
396 }
397
SkipCComment(const char * p) const398 const char *Prescanner::SkipCComment(const char *p) const {
399 char star{' '}, slash{' '};
400 p += 2;
401 while (star != '*' || slash != '/') {
402 if (p >= limit_) {
403 return nullptr; // signifies an unterminated comment
404 }
405 star = slash;
406 slash = *p++;
407 }
408 return p;
409 }
410
// Scans one token at at_ and appends it to `tokens`.
// Returns false, without closing a token, when at_ is at a newline once
// leading white space has been handled; otherwise closes the scanned token
// and returns true.  In free form (and in directives) a run of white space
// is either dropped or returned as a single-space token of its own.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    // A space was requested earlier (by continuation handling); it becomes
    // the first character of the token scanned below.
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // Digit string; `n` accumulates its value (it stops growing once it
    // reaches maxHollerith) for use as a possible Hollerith length.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // "0x..." hexadecimal constant, accepted only within a directive.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    // Any other character: track bracket nesting and combine recognized
    // two-character operators into a single token.
    char ch{*at_};
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
547
ExponentAndKind(TokenSequence & tokens)548 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
549 char ed{ToLowerCaseLetter(*at_)};
550 if (ed != 'e' && ed != 'd') {
551 return false;
552 }
553 EmitCharAndAdvance(tokens, ed);
554 if (*at_ == '+' || *at_ == '-') {
555 EmitCharAndAdvance(tokens, *at_);
556 }
557 while (IsDecimalDigit(*at_)) {
558 EmitCharAndAdvance(tokens, *at_);
559 }
560 if (*at_ == '_') {
561 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
562 }
563 }
564 return true;
565 }
566
// Scans a character literal whose opening quote mark is at at_ and emits
// it to `tokens`.  `start` is the beginning of the enclosing token (which
// may include a kind prefix) and is used only for the provenance range of
// the "Incomplete character literal" message.  inCharLiteral_ is set while
// the body of the literal is being scanned and is clear on return.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  // True when the current character is a backslash that escapes the next
  // one; only ever set when backslash escapes are enabled.
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      // The line ended before a closing quote was seen.
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break; // that was the closing quote
      }
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}
619
Hollerith(TokenSequence & tokens,int count,const char * start)620 void Prescanner::Hollerith(
621 TokenSequence &tokens, int count, const char *start) {
622 inCharLiteral_ = true;
623 CHECK(*at_ == 'h' || *at_ == 'H');
624 EmitChar(tokens, 'H');
625 while (count-- > 0) {
626 if (PadOutCharacterLiteral(tokens)) {
627 } else if (*at_ == '\n') {
628 Say(GetProvenanceRange(start, at_),
629 "Possible truncated Hollerith literal"_en_US);
630 break;
631 } else {
632 NextChar();
633 // Each multi-byte character encoding counts as a single character.
634 // No escape sequences are recognized.
635 // Hollerith is always emitted to the cooked character
636 // stream in UTF-8.
637 DecodedCharacter decoded{DecodeCharacter(
638 encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
639 if (decoded.bytes > 0) {
640 EncodedCharacter utf8{
641 EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
642 for (int j{0}; j < utf8.bytes; ++j) {
643 EmitChar(tokens, utf8.buffer[j]);
644 }
645 at_ += decoded.bytes - 1;
646 } else {
647 Say(GetProvenanceRange(start, at_),
648 "Bad character in Hollerith literal"_err_en_US);
649 break;
650 }
651 }
652 }
653 if (*at_ != '\n') {
654 NextChar();
655 }
656 inCharLiteral_ = false;
657 }
658
659 // In fixed form, source card images must be processed as if they were at
660 // least 72 columns wide, at least in character literal contexts.
PadOutCharacterLiteral(TokenSequence & tokens)661 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
662 while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
663 if (column_ < fixedFormColumnLimit_) {
664 tokens.PutNextTokenChar(' ', spaceProvenance_);
665 ++column_;
666 return true;
667 }
668 if (!FixedFormContinuation(false /*no need to insert space*/) ||
669 tabInCurrentLine_) {
670 return false;
671 }
672 CHECK(column_ == 7);
673 --at_; // point to column 6 of continuation line
674 column_ = 6;
675 }
676 return false;
677 }
678
IsFixedFormCommentLine(const char * start) const679 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
680 const char *p{start};
681 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
682 ((*p == 'D' || *p == 'd') &&
683 !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
684 return true;
685 }
686 bool anyTabs{false};
687 while (true) {
688 if (*p == ' ') {
689 ++p;
690 } else if (*p == '\t') {
691 anyTabs = true;
692 ++p;
693 } else if (*p == '0' && !anyTabs && p == start + 5) {
694 ++p; // 0 in column 6 must treated as a space
695 } else {
696 break;
697 }
698 }
699 if (!anyTabs && p >= start + fixedFormColumnLimit_) {
700 return true;
701 }
702 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
703 return true;
704 }
705 return *p == '\n';
706 }
707
IsFreeFormComment(const char * p) const708 const char *Prescanner::IsFreeFormComment(const char *p) const {
709 p = SkipWhiteSpaceAndCComments(p);
710 if (*p == '!' || *p == '\n') {
711 return p;
712 } else {
713 return nullptr;
714 }
715 }
716
IsIncludeLine(const char * start) const717 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
718 const char *p{SkipWhiteSpace(start)};
719 for (char ch : "include"s) {
720 if (ToLowerCaseLetter(*p++) != ch) {
721 return std::nullopt;
722 }
723 }
724 p = SkipWhiteSpace(p);
725 if (*p == '"' || *p == '\'') {
726 return {p - start};
727 }
728 return std::nullopt;
729 }
730
FortranInclude(const char * firstQuote)731 void Prescanner::FortranInclude(const char *firstQuote) {
732 const char *p{firstQuote};
733 while (*p != '"' && *p != '\'') {
734 ++p;
735 }
736 char quote{*p};
737 std::string path;
738 for (++p; *p != '\n'; ++p) {
739 if (*p == quote) {
740 if (p[1] != quote) {
741 break;
742 }
743 ++p;
744 }
745 path += *p;
746 }
747 if (*p != quote) {
748 Say(GetProvenanceRange(firstQuote, p),
749 "malformed path name string"_err_en_US);
750 return;
751 }
752 p = SkipWhiteSpace(p + 1);
753 if (*p != '\n' && *p != '!') {
754 const char *garbage{p};
755 for (; *p != '\n' && *p != '!'; ++p) {
756 }
757 Say(GetProvenanceRange(garbage, p),
758 "excess characters after path name"_en_US);
759 }
760 std::string buf;
761 llvm::raw_string_ostream error{buf};
762 Provenance provenance{GetProvenance(nextLine_)};
763 const SourceFile *currentFile{allSources_.GetSourceFile(provenance)};
764 if (currentFile) {
765 allSources_.PushSearchPathDirectory(DirectoryName(currentFile->path()));
766 }
767 const SourceFile *included{allSources_.Open(path, error)};
768 if (currentFile) {
769 allSources_.PopSearchPathDirectory();
770 }
771 if (!included) {
772 Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
773 } else if (included->bytes() > 0) {
774 ProvenanceRange includeLineRange{
775 provenance, static_cast<std::size_t>(p - nextLine_)};
776 ProvenanceRange fileRange{
777 allSources_.AddIncludedFile(*included, includeLineRange)};
778 Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
779 }
780 }
781
IsPreprocessorDirectiveLine(const char * start) const782 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
783 const char *p{start};
784 for (; *p == ' '; ++p) {
785 }
786 if (*p == '#') {
787 if (inFixedForm_ && p == start + 5) {
788 return nullptr;
789 }
790 } else {
791 p = SkipWhiteSpace(p);
792 if (*p != '#') {
793 return nullptr;
794 }
795 }
796 return SkipWhiteSpace(p + 1);
797 }
798
IsNextLinePreprocessorDirective() const799 bool Prescanner::IsNextLinePreprocessorDirective() const {
800 return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
801 }
802
SkipCommentLine(bool afterAmpersand)803 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
804 if (IsAtEnd()) {
805 if (afterAmpersand && prescannerNesting_ > 0) {
806 // A continuation marker at the end of the last line in an
807 // include file inhibits the newline for that line.
808 SkipToEndOfLine();
809 omitNewline_ = true;
810 }
811 return false;
812 }
813 auto lineClass{ClassifyLine(nextLine_)};
814 if (lineClass.kind == LineClassification::Kind::Comment) {
815 NextLine();
816 return true;
817 } else if (inPreprocessorDirective_) {
818 return false;
819 } else if (lineClass.kind ==
820 LineClassification::Kind::ConditionalCompilationDirective ||
821 lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
822 // Allow conditional compilation directives (e.g., #ifdef) to affect
823 // continuation lines.
824 // Allow other preprocessor directives, too, except #include
825 // (when it does not follow '&'), #define, and #undef (because
826 // they cannot be allowed to affect preceding text on a
827 // continued line).
828 preprocessor_.Directive(TokenizePreprocessorDirective(), this);
829 return true;
830 } else if (afterAmpersand &&
831 (lineClass.kind == LineClassification::Kind::IncludeDirective ||
832 lineClass.kind == LineClassification::Kind::IncludeLine)) {
833 SkipToEndOfLine();
834 omitNewline_ = true;
835 skipLeadingAmpersand_ = true;
836 return false;
837 } else {
838 return false;
839 }
840 }
841
FixedFormContinuationLine(bool mightNeedSpace)842 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
843 if (IsAtEnd()) {
844 return nullptr;
845 }
846 tabInCurrentLine_ = false;
847 char col1{*nextLine_};
848 if (InCompilerDirective()) {
849 // Must be a continued compiler directive.
850 if (!IsFixedFormCommentChar(col1)) {
851 return nullptr;
852 }
853 int j{1};
854 for (; j < 5; ++j) {
855 char ch{directiveSentinel_[j - 1]};
856 if (ch == '\0') {
857 break;
858 }
859 if (ch != ToLowerCaseLetter(nextLine_[j])) {
860 return nullptr;
861 }
862 }
863 for (; j < 5; ++j) {
864 if (nextLine_[j] != ' ') {
865 return nullptr;
866 }
867 }
868 char col6{nextLine_[5]};
869 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
870 if (nextLine_[6] != ' ' && mightNeedSpace) {
871 insertASpace_ = true;
872 }
873 return nextLine_ + 6;
874 }
875 return nullptr;
876 } else {
877 // Normal case: not in a compiler directive.
878 if (col1 == '&' &&
879 features_.IsEnabled(
880 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
881 // Extension: '&' as continuation marker
882 if (features_.ShouldWarn(
883 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
884 Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
885 }
886 return nextLine_ + 1;
887 }
888 if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
889 tabInCurrentLine_ = true;
890 return nextLine_ + 2; // VAX extension
891 }
892 if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
893 nextLine_[3] == ' ' && nextLine_[4] == ' ') {
894 char col6{nextLine_[5]};
895 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
896 return nextLine_ + 6;
897 }
898 }
899 if (IsImplicitContinuation()) {
900 return nextLine_;
901 }
902 }
903 return nullptr; // not a continuation line
904 }
905
FreeFormContinuationLine(bool ampersand)906 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
907 const char *p{nextLine_};
908 if (p >= limit_) {
909 return nullptr;
910 }
911 p = SkipWhiteSpace(p);
912 if (InCompilerDirective()) {
913 if (*p++ != '!') {
914 return nullptr;
915 }
916 for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
917 if (*s != ToLowerCaseLetter(*p)) {
918 return nullptr;
919 }
920 }
921 p = SkipWhiteSpace(p);
922 if (*p == '&') {
923 if (!ampersand) {
924 insertASpace_ = true;
925 }
926 return p + 1;
927 } else if (ampersand) {
928 return p;
929 } else {
930 return nullptr;
931 }
932 } else {
933 if (*p == '&') {
934 return p + 1;
935 } else if (*p == '!' || *p == '\n' || *p == '#') {
936 return nullptr;
937 } else if (ampersand || IsImplicitContinuation()) {
938 if (p > nextLine_) {
939 --p;
940 } else {
941 insertASpace_ = true;
942 }
943 return p;
944 } else {
945 return nullptr;
946 }
947 }
948 }
949
FixedFormContinuation(bool mightNeedSpace)950 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
951 // N.B. We accept '&' as a continuation indicator in fixed form, too,
952 // but not in a character literal.
953 if (*at_ == '&' && inCharLiteral_) {
954 return false;
955 }
956 do {
957 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
958 BeginSourceLine(cont);
959 column_ = 7;
960 NextLine();
961 return true;
962 }
963 } while (SkipCommentLine(false /* not after ampersand */));
964 return false;
965 }
966
FreeFormContinuation()967 bool Prescanner::FreeFormContinuation() {
968 const char *p{at_};
969 bool ampersand{*p == '&'};
970 if (ampersand) {
971 p = SkipWhiteSpace(p + 1);
972 }
973 if (*p != '\n') {
974 if (inCharLiteral_) {
975 return false;
976 } else if (*p != '!' &&
977 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
978 Say(GetProvenance(p), "missing ! before comment after &"_en_US);
979 }
980 }
981 do {
982 if (const char *cont{FreeFormContinuationLine(ampersand)}) {
983 BeginSourceLine(cont);
984 NextLine();
985 return true;
986 }
987 } while (SkipCommentLine(ampersand));
988 return false;
989 }
990
991 // Implicit line continuation allows a preprocessor macro call with
992 // arguments to span multiple lines.
IsImplicitContinuation() const993 bool Prescanner::IsImplicitContinuation() const {
994 return !inPreprocessorDirective_ && !inCharLiteral_ &&
995 delimiterNesting_ > 0 && !IsAtEnd() &&
996 ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
997 }
998
Continuation(bool mightNeedFixedFormSpace)999 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1000 if (*at_ == '\n' || *at_ == '&') {
1001 if (inFixedForm_) {
1002 return FixedFormContinuation(mightNeedFixedFormSpace);
1003 } else {
1004 return FreeFormContinuation();
1005 }
1006 } else {
1007 return false;
1008 }
1009 }
1010
1011 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const1012 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1013 const char *p{start};
1014 char col1{*p++};
1015 if (!IsFixedFormCommentChar(col1)) {
1016 return std::nullopt;
1017 }
1018 char sentinel[5], *sp{sentinel};
1019 int column{2};
1020 for (; column < 6; ++column, ++p) {
1021 if (*p != ' ') {
1022 if (*p == '\n' || *p == '\t') {
1023 break;
1024 }
1025 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1026 // OpenMP conditional compilation line: leave the label alone
1027 break;
1028 }
1029 *sp++ = ToLowerCaseLetter(*p);
1030 }
1031 }
1032 if (column == 6) {
1033 if (*p == ' ' || *p == '\t' || *p == '0') {
1034 ++p;
1035 } else {
1036 // This is a Continuation line, not an initial directive line.
1037 return std::nullopt;
1038 }
1039 }
1040 if (sp == sentinel) {
1041 return std::nullopt;
1042 }
1043 *sp = '\0';
1044 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1045 std::size_t payloadOffset = p - start;
1046 return {LineClassification{
1047 LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1048 }
1049 return std::nullopt;
1050 }
1051
1052 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1053 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1054 char sentinel[8];
1055 const char *p{SkipWhiteSpace(start)};
1056 if (*p++ != '!') {
1057 return std::nullopt;
1058 }
1059 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1060 if (*p == '\n') {
1061 break;
1062 }
1063 if (*p == ' ' || *p == '\t' || *p == '&') {
1064 if (j == 0) {
1065 break;
1066 }
1067 sentinel[j] = '\0';
1068 p = SkipWhiteSpace(p + 1);
1069 if (*p == '!') {
1070 break;
1071 }
1072 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1073 std::size_t offset = p - start;
1074 return {LineClassification{
1075 LineClassification::Kind::CompilerDirective, offset, sp}};
1076 }
1077 break;
1078 }
1079 sentinel[j] = ToLowerCaseLetter(*p);
1080 }
1081 return std::nullopt;
1082 }
1083
AddCompilerDirectiveSentinel(const std::string & dir)1084 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1085 std::uint64_t packed{0};
1086 for (char ch : dir) {
1087 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1088 }
1089 compilerDirectiveBloomFilter_.set(packed % prime1);
1090 compilerDirectiveBloomFilter_.set(packed % prime2);
1091 compilerDirectiveSentinels_.insert(dir);
1092 return *this;
1093 }
1094
IsCompilerDirectiveSentinel(const char * sentinel) const1095 const char *Prescanner::IsCompilerDirectiveSentinel(
1096 const char *sentinel) const {
1097 std::uint64_t packed{0};
1098 std::size_t n{0};
1099 for (; sentinel[n] != '\0'; ++n) {
1100 packed = (packed << 8) | (sentinel[n] & 0xff);
1101 }
1102 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1103 !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1104 return nullptr;
1105 }
1106 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1107 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1108 }
1109
IsDirective(const char * match,const char * dir)1110 constexpr bool IsDirective(const char *match, const char *dir) {
1111 for (; *match; ++match) {
1112 if (*match != ToLowerCaseLetter(*dir++)) {
1113 return false;
1114 }
1115 }
1116 return true;
1117 }
1118
ClassifyLine(const char * start) const1119 Prescanner::LineClassification Prescanner::ClassifyLine(
1120 const char *start) const {
1121 if (inFixedForm_) {
1122 if (std::optional<LineClassification> lc{
1123 IsFixedFormCompilerDirectiveLine(start)}) {
1124 return std::move(*lc);
1125 }
1126 if (IsFixedFormCommentLine(start)) {
1127 return {LineClassification::Kind::Comment};
1128 }
1129 } else {
1130 if (std::optional<LineClassification> lc{
1131 IsFreeFormCompilerDirectiveLine(start)}) {
1132 return std::move(*lc);
1133 }
1134 if (const char *bang{IsFreeFormComment(start)}) {
1135 return {LineClassification::Kind::Comment,
1136 static_cast<std::size_t>(bang - start)};
1137 }
1138 }
1139 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1140 return {LineClassification::Kind::IncludeLine, *quoteOffset};
1141 }
1142 if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1143 if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1144 IsDirective("else", dir) || IsDirective("endif", dir)) {
1145 return {LineClassification::Kind::ConditionalCompilationDirective};
1146 } else if (IsDirective("include", dir)) {
1147 return {LineClassification::Kind::IncludeDirective};
1148 } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1149 return {LineClassification::Kind::DefinitionDirective};
1150 } else {
1151 return {LineClassification::Kind::PreprocessorDirective};
1152 }
1153 }
1154 return {LineClassification::Kind::Source};
1155 }
1156
SourceFormChange(std::string && dir)1157 void Prescanner::SourceFormChange(std::string &&dir) {
1158 if (dir == "!dir$ free") {
1159 inFixedForm_ = false;
1160 } else if (dir == "!dir$ fixed") {
1161 inFixedForm_ = true;
1162 }
1163 }
1164 } // namespace Fortran::parser
1165