• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Upper bound on prescanner nesting depth.  Prescan() reports an error and
// returns without scanning once prescannerNesting_ exceeds this value.
26 static constexpr int maxPrescannerNesting{100};
27 
Prescanner(Messages & messages,CookedSource & cooked,Preprocessor & preprocessor,common::LanguageFeatureControl lfc)28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       encoding_{allSources_.encoding()} {}
33 
// Copy constructor.  Carries over the shared collaborators and the scanning
// configuration of `that`, and records one more level of nesting than `that`
// had.  Members that are not listed below are not copied from `that`.
// FortranInclude() uses this to scan an included file.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_},
      cooked_{that.cooked_},
      preprocessor_{that.preprocessor_},
      allSources_{that.allSources_},
      features_{that.features_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44 
// Returns true when `ch` is one of the characters that this prescanner
// accepts as a fixed-form comment marker: '!', '*', 'C', or 'c'.
static constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48 
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50   char *p{dir.GetMutableCharData()};
51   char *limit{p + dir.SizeInChars()};
52   for (; p < limit; ++p) {
53     if (*p != ' ') {
54       CHECK(IsFixedFormCommentChar(*p));
55       *p = '!';
56       return;
57     }
58   }
59   DIE("compiler directive all blank");
60 }
61 
Prescan(ProvenanceRange range)62 void Prescanner::Prescan(ProvenanceRange range) {
63   startProvenance_ = range.start();
64   start_ = allSources_.GetSource(range);
65   CHECK(start_);
66   limit_ = start_ + range.size();
67   nextLine_ = start_;
68   const bool beganInFixedForm{inFixedForm_};
69   if (prescannerNesting_ > maxPrescannerNesting) {
70     Say(GetProvenance(start_),
71         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72     return;
73   }
74   while (!IsAtEnd()) {
75     Statement();
76   }
77   if (inFixedForm_ != beganInFixedForm) {
78     std::string dir{"!dir$ "};
79     if (beganInFixedForm) {
80       dir += "fixed";
81     } else {
82       dir += "free";
83     }
84     dir += '\n';
85     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86     tokens.Emit(cooked_);
87   }
88 }
89 
// Prescans one statement beginning at nextLine_.  The line is classified
// first: comment lines are skipped, INCLUDE lines and preprocessing
// directives are dispatched to their handlers (and this function returns),
// while compiler directives and ordinary source lines are tokenized, passed
// to the preprocessor for macro replacement, and emitted to cooked_,
// followed by a newline unless omitNewline_ was set during scanning.
Statement()90 void Prescanner::Statement() {
91   TokenSequence tokens;
92   LineClassification line{ClassifyLine(nextLine_)};
93   switch (line.kind) {
94   case LineClassification::Kind::Comment:
95     nextLine_ += line.payloadOffset; // advance to '!' or newline
96     NextLine();
97     return;
98   case LineClassification::Kind::IncludeLine:
99     FortranInclude(nextLine_ + line.payloadOffset);
100     NextLine();
101     return;
  // Every kind of preprocessing directive is tokenized and handed to the
  // preprocessor; nothing is emitted from here.
102   case LineClassification::Kind::ConditionalCompilationDirective:
103   case LineClassification::Kind::IncludeDirective:
104   case LineClassification::Kind::DefinitionDirective:
105   case LineClassification::Kind::PreprocessorDirective:
106     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
107     return;
108   case LineClassification::Kind::CompilerDirective:
    // Remember the sentinel so that continuation-line recognition can match
    // it on following lines; it is reset to nullptr at the end of Statement.
109     directiveSentinel_ = line.sentinel;
110     CHECK(InCompilerDirective());
111     BeginStatementAndAdvance();
112     if (inFixedForm_) {
113       CHECK(IsFixedFormCommentChar(*at_));
114     } else {
115       while (*at_ == ' ' || *at_ == '\t') {
116         ++at_, ++column_;
117       }
118       CHECK(*at_ == '!');
119     }
120     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
121       // OpenMP conditional compilation line.  Remove the sentinel and then
122       // treat the line as if it were normal source.
      // The two characters skipped are the comment character and the '$'.
123       at_ += 2, column_ += 2;
124       if (inFixedForm_) {
125         LabelField(tokens);
126       } else {
127         SkipSpaces();
128       }
129     } else {
130       // Compiler directive.  Emit normalized sentinel.
131       EmitChar(tokens, '!');
132       ++at_, ++column_;
      // Emit the sentinel text as recorded by the classifier while stepping
      // over the same number of source characters.
133       for (const char *sp{directiveSentinel_}; *sp != '\0';
134            ++sp, ++at_, ++column_) {
135         EmitChar(tokens, *sp);
136       }
137       if (*at_ == ' ') {
138         EmitChar(tokens, ' ');
139         ++at_, ++column_;
140       }
141       tokens.CloseToken();
142     }
143     break;
144   case LineClassification::Kind::Source:
145     BeginStatementAndAdvance();
146     if (inFixedForm_) {
147       LabelField(tokens);
148     } else if (skipLeadingAmpersand_) {
      // skipLeadingAmpersand_ is set by SkipCommentLine() when an INCLUDE
      // line or #include directive follows a '&' continuation; consume at
      // most one leading '&' here and clear the flag.
149       skipLeadingAmpersand_ = false;
150       const char *p{SkipWhiteSpace(at_)};
151       if (p < limit_ && *p == '&') {
152         column_ += ++p - at_;
153         at_ = p;
154       }
155     } else {
156       SkipSpaces();
157     }
158     break;
159   }
160 
  // Tokenize the remainder of the statement.
161   while (NextToken(tokens)) {
162   }
163 
  // Provenance for the newline is captured before any reprocessing below.
164   Provenance newlineProvenance{GetCurrentProvenance()};
165   if (std::optional<TokenSequence> preprocessed{
166           preprocessor_.MacroReplacement(tokens, *this)}) {
167     // Reprocess the preprocessed line.  Append a newline temporarily.
168     preprocessed->PutNextTokenChar('\n', newlineProvenance);
169     preprocessed->CloseToken();
170     const char *ppd{preprocessed->ToCharBlock().begin()};
171     LineClassification ppl{ClassifyLine(ppd)};
172     preprocessed->RemoveLastToken(); // remove the newline
173     switch (ppl.kind) {
174     case LineClassification::Kind::Comment:
      // A line that became a comment after replacement emits no tokens.
175       break;
176     case LineClassification::Kind::IncludeLine:
177       FortranInclude(ppd + ppl.payloadOffset);
178       break;
179     case LineClassification::Kind::ConditionalCompilationDirective:
180     case LineClassification::Kind::IncludeDirective:
181     case LineClassification::Kind::DefinitionDirective:
182     case LineClassification::Kind::PreprocessorDirective:
      // The result is not executed as a directive: a message is issued and
      // the text is emitted like ordinary tokens.
183       Say(preprocessed->GetProvenanceRange(),
184           "Preprocessed line resembles a preprocessor directive"_en_US);
185       preprocessed->ToLowerCase().CheckBadFortranCharacters(messages_).Emit(
186           cooked_);
187       break;
188     case LineClassification::Kind::CompilerDirective:
189       if (preprocessed->HasRedundantBlanks()) {
190         preprocessed->RemoveRedundantBlanks();
191       }
192       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
193       preprocessed->ToLowerCase();
194       SourceFormChange(preprocessed->ToString());
195       preprocessed->ClipComment(true /* skip first ! */)
196           .CheckBadFortranCharacters(messages_)
197           .Emit(cooked_);
198       break;
199     case LineClassification::Kind::Source:
200       if (inFixedForm_) {
201         if (preprocessed->HasBlanks(/*after column*/ 6)) {
202           preprocessed->RemoveBlanks(/*after column*/ 6);
203         }
204       } else {
205         if (preprocessed->HasRedundantBlanks()) {
206           preprocessed->RemoveRedundantBlanks();
207         }
208       }
209       preprocessed->ToLowerCase()
210           .ClipComment()
211           .CheckBadFortranCharacters(messages_)
212           .Emit(cooked_);
213       break;
214     }
215   } else {
    // MacroReplacement() returned no value: emit the original tokens.
216     tokens.ToLowerCase();
217     if (line.kind == LineClassification::Kind::CompilerDirective) {
218       SourceFormChange(tokens.ToString());
219     }
220     tokens.CheckBadFortranCharacters(messages_).Emit(cooked_);
221   }
  // omitNewline_ is set by SkipCommentLine(); it suppresses the newline for
  // this one statement only.
222   if (omitNewline_) {
223     omitNewline_ = false;
224   } else {
225     cooked_.Put('\n', newlineProvenance);
226   }
227   directiveSentinel_ = nullptr;
228 }
229 
TokenizePreprocessorDirective()230 TokenSequence Prescanner::TokenizePreprocessorDirective() {
231   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
232   inPreprocessorDirective_ = true;
233   BeginStatementAndAdvance();
234   TokenSequence tokens;
235   while (NextToken(tokens)) {
236   }
237   inPreprocessorDirective_ = false;
238   return tokens;
239 }
240 
NextLine()241 void Prescanner::NextLine() {
242   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
243   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
244   if (!v) {
245     nextLine_ = limit_;
246   } else {
247     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
248     nextLine_ = nl + 1;
249   }
250 }
251 
// Scans the label field (columns 1 through 6) of a fixed-form line.
// Non-blank characters are emitted into `token`; a tab ends the field and
// sets the column to 7; a '0' in column 6 is dropped like a blank.  An empty
// field yields a single space.  A message is issued for a non-digit in the
// field unless the emitted text is a name known to the preprocessor, and
// for a digit that immediately follows the field.
LabelField(TokenSequence & token)252 void Prescanner::LabelField(TokenSequence &token) {
253   const char *bad{nullptr};
  // outCol counts emitted characters (plus one); 1 means nothing emitted.
254   int outCol{1};
255   for (; *at_ != '\n' && column_ <= 6; ++at_) {
256     if (*at_ == '\t') {
257       ++at_;
258       column_ = 7;
259       break;
260     }
261     if (*at_ != ' ' &&
262         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
263       EmitChar(token, *at_);
264       ++outCol;
      // Remember only the first non-digit for the diagnostic below.
265       if (!bad && !IsDecimalDigit(*at_)) {
266         bad = at_;
267       }
268     }
269     ++column_;
270   }
271   if (outCol == 1) { // empty label field
272     // Emit a space so that, if the line is rescanned after preprocessing,
273     // a leading 'C' or 'D' won't be left-justified and then accidentally
274     // misinterpreted as a comment card.
275     EmitChar(token, ' ');
276     ++outCol;
277   } else {
    // A non-digit label is tolerated silently when it names a macro.
278     if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
279       Say(GetProvenance(bad),
280           "Character in fixed-form label field must be a digit"_en_US);
281     }
282   }
283   token.CloseToken();
284   SkipToNextSignificantCharacter();
285   if (IsDecimalDigit(*at_)) {
286     Say(GetProvenance(at_),
287         "Label digit is not in fixed-form label field"_en_US);
288   }
289 }
290 
SkipToEndOfLine()291 void Prescanner::SkipToEndOfLine() {
292   while (*at_ != '\n') {
293     ++at_, ++column_;
294   }
295 }
296 
MustSkipToEndOfLine() const297 bool Prescanner::MustSkipToEndOfLine() const {
298   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
299     return true; // skip over ignored columns in right margin (73:80)
300   } else if (*at_ == '!' && !inCharLiteral_) {
301     return true; // inline comment goes to end of source line
302   } else {
303     return false;
304   }
305 }
306 
NextChar()307 void Prescanner::NextChar() {
308   CHECK(*at_ != '\n');
309   ++at_, ++column_;
310   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
311     // UTF-8 byte order mark - treat this file as UTF-8
312     at_ += 3;
313     encoding_ = Encoding::UTF_8;
314   }
315   SkipToNextSignificantCharacter();
316 }
317 
318 // Skip everything that should be ignored until the next significant
319 // character is reached; handles C-style comments in preprocessing
320 // directives, Fortran ! comments, stuff after the right margin in
321 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()322 void Prescanner::SkipToNextSignificantCharacter() {
323   if (inPreprocessorDirective_) {
324     SkipCComments();
325   } else {
326     bool mightNeedSpace{false};
327     if (MustSkipToEndOfLine()) {
328       SkipToEndOfLine();
329     } else {
330       mightNeedSpace = *at_ == '\n';
331     }
332     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
333       if (MustSkipToEndOfLine()) {
334         SkipToEndOfLine();
335       }
336     }
337     if (*at_ == '\t') {
338       tabInCurrentLine_ = true;
339     }
340   }
341 }
342 
SkipCComments()343 void Prescanner::SkipCComments() {
344   while (true) {
345     if (IsCComment(at_)) {
346       if (const char *after{SkipCComment(at_)}) {
347         column_ += after - at_;
348         // May have skipped over one or more newlines; relocate the start of
349         // the next line.
350         nextLine_ = at_ = after;
351         NextLine();
352       } else {
353         // Don't emit any messages about unclosed C-style comments, because
354         // the sequence /* can appear legally in a FORMAT statement.  There's
355         // no ambiguity, since the sequence */ cannot appear legally.
356         break;
357       }
358     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
359         at_[1] == '\n' && !IsAtEnd()) {
360       BeginSourceLineAndAdvance();
361     } else {
362       break;
363     }
364   }
365 }
366 
SkipSpaces()367 void Prescanner::SkipSpaces() {
368   while (*at_ == ' ' || *at_ == '\t') {
369     NextChar();
370   }
371   insertASpace_ = false;
372 }
373 
SkipWhiteSpace(const char * p)374 const char *Prescanner::SkipWhiteSpace(const char *p) {
375   while (*p == ' ' || *p == '\t') {
376     ++p;
377   }
378   return p;
379 }
380 
SkipWhiteSpaceAndCComments(const char * p) const381 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
382   while (true) {
383     if (*p == ' ' || *p == '\t') {
384       ++p;
385     } else if (IsCComment(p)) {
386       if (const char *after{SkipCComment(p)}) {
387         p = after;
388       } else {
389         break;
390       }
391     } else {
392       break;
393     }
394   }
395   return p;
396 }
397 
SkipCComment(const char * p) const398 const char *Prescanner::SkipCComment(const char *p) const {
399   char star{' '}, slash{' '};
400   p += 2;
401   while (star != '*' || slash != '/') {
402     if (p >= limit_) {
403       return nullptr; // signifies an unterminated comment
404     }
405     star = slash;
406     slash = *p++;
407   }
408   return p;
409 }
410 
// Scans the next token of the current statement or directive and appends it
// to `tokens`.  Returns false, without adding a token, when the scan
// position is at the newline that ends the statement; returns true
// otherwise.  Recognizes character literals, numeric literals (including
// Hollerith and, in directives, 0x hexadecimal), names, and one- or
// two-character operators.
NextToken(TokenSequence & tokens)411 bool Prescanner::NextToken(TokenSequence &tokens) {
412   CHECK(at_ >= start_ && at_ < limit_);
413   if (InFixedFormSource()) {
    // In fixed-form source, blanks between tokens are simply skipped.
414     SkipSpaces();
415   } else {
416     if (*at_ == '/' && IsCComment(at_)) {
417       // Recognize and skip over classic C style /*comments*/ when
418       // outside a character literal.
419       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
420         Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
421       }
422       SkipCComments();
423     }
424     if (*at_ == ' ' || *at_ == '\t') {
425       // Compress free-form white space into a single space character.
426       const auto theSpace{at_};
      // Character before the white space; a blank when at the very start.
427       char previous{at_ <= start_ ? ' ' : at_[-1]};
428       NextChar();
429       SkipSpaces();
430       if (*at_ == '\n') {
431         // Discard white space at the end of a line.
432       } else if (!inPreprocessorDirective_ &&
433           (previous == '(' || *at_ == '(' || *at_ == ')')) {
434         // Discard white space before/after '(' and before ')', unless in a
435         // preprocessor directive.  This helps yield space-free contiguous
436         // names for generic interfaces like OPERATOR( + ) and
437         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
438         // This has the effect of silently ignoring the illegal spaces in
439         // the array constructor ( /1,2/ ) but that seems benign; it's
440         // hard to avoid that while still removing spaces from OPERATOR( / )
441         // and OPERATOR( // ).
442       } else {
443         // Preserve the squashed white space as a single space character.
444         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
445         tokens.CloseToken();
446         return true;
447       }
448     }
449   }
  // insertASpace_ is requested by the continuation-line helpers; the space
  // becomes the first character of the token scanned below.
450   if (insertASpace_) {
451     tokens.PutNextTokenChar(' ', spaceProvenance_);
452     insertASpace_ = false;
453   }
454   if (*at_ == '\n') {
455     return false;
456   }
457   const char *start{at_};
458   if (*at_ == '\'' || *at_ == '"') {
459     QuotedCharacterLiteral(tokens, start);
460     preventHollerith_ = false;
461   } else if (IsDecimalDigit(*at_)) {
    // A digit string: n accumulates its value as a candidate Hollerith
    // count, and stops accumulating once it reaches maxHollerith.
462     int n{0}, digits{0};
463     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
464     do {
465       if (n < maxHollerith) {
466         n = 10 * n + DecimalDigitValue(*at_);
467       }
468       EmitCharAndAdvance(tokens, *at_);
469       ++digits;
470       if (InFixedFormSource()) {
471         SkipSpaces();
472       }
473     } while (IsDecimalDigit(*at_));
474     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
475         !preventHollerith_) {
476       Hollerith(tokens, n, start);
477     } else if (*at_ == '.') {
      // Fractional part followed by an optional exponent and kind suffix.
478       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
479       }
480       ExponentAndKind(tokens);
481     } else if (ExponentAndKind(tokens)) {
482     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
483         inPreprocessorDirective_) {
      // "0x..." hexadecimal constant, recognized only within directives.
484       do {
485         EmitCharAndAdvance(tokens, *at_);
486       } while (IsHexadecimalDigit(*at_));
487     } else if (IsLetter(*at_)) {
488       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
489       // we don't misrecognize I9HOLLERITH as an identifier in the next case.
490       EmitCharAndAdvance(tokens, *at_);
491     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
492       EmitCharAndAdvance(tokens, *at_);
493       QuotedCharacterLiteral(tokens, start);
494     }
495     preventHollerith_ = false;
496   } else if (*at_ == '.') {
    // Either a real literal that begins with '.', the "..." ellipsis, or
    // a lone '.' token.
497     char nch{EmitCharAndAdvance(tokens, '.')};
498     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
499       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
500       }
501       ExponentAndKind(tokens);
502     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
503       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
504     }
505     preventHollerith_ = false;
506   } else if (IsLegalInIdentifier(*at_)) {
    // A name; if it ends in '_' and a quote follows, the character literal
    // is made part of the same token.
507     do {
508     } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
509     if ((*at_ == '\'' || *at_ == '"') &&
510         tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
511       QuotedCharacterLiteral(tokens, start);
512     }
513     preventHollerith_ = false;
514   } else if (*at_ == '*') {
515     if (EmitCharAndAdvance(tokens, '*') == '*') {
516       EmitCharAndAdvance(tokens, '*');
517     } else {
518       // Subtle ambiguity:
519       //  CHARACTER*2H     declares H because *2 is a kind specifier
520       //  DATAC/N*2H  /    is repeated Hollerith
521       preventHollerith_ = !slashInCurrentStatement_;
522     }
523   } else {
    // Any other character: track bracket nesting, then decide whether it
    // combines with the following character into a two-character token.
524     char ch{*at_};
525     if (ch == '(' || ch == '[') {
526       ++delimiterNesting_;
527     } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
528       --delimiterNesting_;
529     }
530     char nch{EmitCharAndAdvance(tokens, ch)};
531     preventHollerith_ = false;
532     if ((nch == '=' &&
533             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
534         (ch == nch &&
535             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
536                 ch == '|' || ch == '<' || ch == '>')) ||
537         (ch == '=' && nch == '>')) {
538       // token comprises two characters
539       EmitCharAndAdvance(tokens, nch);
540     } else if (ch == '/') {
      // Recorded for the '*' case above, which uses it to decide whether
      // a following nH may be a Hollerith literal.
541       slashInCurrentStatement_ = true;
542     }
543   }
544   tokens.CloseToken();
545   return true;
546 }
547 
ExponentAndKind(TokenSequence & tokens)548 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
549   char ed{ToLowerCaseLetter(*at_)};
550   if (ed != 'e' && ed != 'd') {
551     return false;
552   }
553   EmitCharAndAdvance(tokens, ed);
554   if (*at_ == '+' || *at_ == '-') {
555     EmitCharAndAdvance(tokens, *at_);
556   }
557   while (IsDecimalDigit(*at_)) {
558     EmitCharAndAdvance(tokens, *at_);
559   }
560   if (*at_ == '_') {
561     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
562     }
563   }
564   return true;
565 }
566 
// Scans a quoted character literal whose opening quote is at at_ and emits
// it into `tokens`.  `start` is the beginning of the enclosing token and is
// used only for the provenance range of the "Incomplete character literal"
// message.  inCharLiteral_ is set while inside the literal and cleared on
// return.
QuotedCharacterLiteral(TokenSequence & tokens,const char * start)567 void Prescanner::QuotedCharacterLiteral(
568     TokenSequence &tokens, const char *start) {
569   char quote{*at_};
  // `end` tracks one past the last character consumed, for diagnostics.
570   const char *end{at_ + 1};
571   inCharLiteral_ = true;
572   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
573   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
574   bool isEscaped{false};
575   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
576   while (true) {
577     if (*at_ == '\\') {
578       if (escapesEnabled) {
        // Toggle so that an escaped backslash does not escape what follows.
579         isEscaped = !isEscaped;
580       } else {
581         // The parser always processes escape sequences, so don't confuse it
582         // when escapes are disabled.
583         insert('\\');
584       }
585     } else {
586       isEscaped = false;
587     }
588     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
589         Encoding::LATIN_1);
    // In fixed form, a literal running off a short line is padded with
    // blanks (see PadOutCharacterLiteral).
590     while (PadOutCharacterLiteral(tokens)) {
591     }
592     if (*at_ == '\n') {
      // The line ended before the closing quote was found.
593       if (!inPreprocessorDirective_) {
594         Say(GetProvenanceRange(start, end),
595             "Incomplete character literal"_err_en_US);
596       }
597       break;
598     }
599     end = at_ + 1;
600     NextChar();
601     if (*at_ == quote && !isEscaped) {
602       // A doubled unescaped quote mark becomes a single instance of that
603       // quote character in the literal (later).  There can be spaces between
604       // the quotes in fixed form source.
605       EmitChar(tokens, quote);
606       inCharLiteral_ = false; // for cases like print *, '...'!comment
607       NextChar();
608       if (InFixedFormSource()) {
609         SkipSpaces();
610       }
      // Anything other than a second quote here ends the literal.
611       if (*at_ != quote) {
612         break;
613       }
614       inCharLiteral_ = true;
615     }
616   }
617   inCharLiteral_ = false;
618 }
619 
// Scans the body of a Hollerith literal into `tokens`.  On entry at_ is at
// the 'h'/'H' that follows the count, which has already been emitted by the
// caller; `count` is the number of characters in the literal and `start`
// is the beginning of the token, used for message provenance.  The 'H' is
// always emitted in upper case.
Hollerith(TokenSequence & tokens,int count,const char * start)620 void Prescanner::Hollerith(
621     TokenSequence &tokens, int count, const char *start) {
622   inCharLiteral_ = true;
623   CHECK(*at_ == 'h' || *at_ == 'H');
624   EmitChar(tokens, 'H');
625   while (count-- > 0) {
    // A padding blank supplied by PadOutCharacterLiteral() counts as one
    // character of the literal.
626     if (PadOutCharacterLiteral(tokens)) {
627     } else if (*at_ == '\n') {
628       Say(GetProvenanceRange(start, at_),
629           "Possible truncated Hollerith literal"_en_US);
630       break;
631     } else {
      // Step past the previously consumed character (initially the 'H').
632       NextChar();
633       // Each multi-byte character encoding counts as a single character.
634       // No escape sequences are recognized.
635       // Hollerith is always emitted to the cooked character
636       // stream in UTF-8.
637       DecodedCharacter decoded{DecodeCharacter(
638           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
639       if (decoded.bytes > 0) {
640         EncodedCharacter utf8{
641             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
642         for (int j{0}; j < utf8.bytes; ++j) {
643           EmitChar(tokens, utf8.buffer[j]);
644         }
        // Leave at_ on the last byte of the decoded character so that the
        // next NextChar() steps past it.
645         at_ += decoded.bytes - 1;
646       } else {
647         Say(GetProvenanceRange(start, at_),
648             "Bad character in Hollerith literal"_err_en_US);
649         break;
650       }
651     }
652   }
  // Step past the last character consumed unless the line has ended.
653   if (*at_ != '\n') {
654     NextChar();
655   }
656   inCharLiteral_ = false;
657 }
658 
659 // In fixed form, source card images must be processed as if they were at
660 // least 72 columns wide, at least in character literal contexts.
// Returns true after emitting one padding blank into `tokens`; returns
// false when no padding applies.  Applies only in fixed form, on a line
// without tabs, when the character after at_ is a newline.  Once the column
// limit has been reached, an attempt is made to continue the literal on the
// next line.
PadOutCharacterLiteral(TokenSequence & tokens)661 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
662   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
663     if (column_ < fixedFormColumnLimit_) {
664       tokens.PutNextTokenChar(' ', spaceProvenance_);
665       ++column_;
666       return true;
667     }
668     if (!FixedFormContinuation(false /*no need to insert space*/) ||
669         tabInCurrentLine_) {
670       return false;
671     }
    // The continuation succeeded; back up one position so that at_[1] is
    // the first character of the continued text, and re-test the loop
    // condition against the new line.
672     CHECK(column_ == 7);
673     --at_; // point to column 6 of continuation line
674     column_ = 6;
675   }
676   return false;
677 }
678 
IsFixedFormCommentLine(const char * start) const679 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
680   const char *p{start};
681   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
682       ((*p == 'D' || *p == 'd') &&
683           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
684     return true;
685   }
686   bool anyTabs{false};
687   while (true) {
688     if (*p == ' ') {
689       ++p;
690     } else if (*p == '\t') {
691       anyTabs = true;
692       ++p;
693     } else if (*p == '0' && !anyTabs && p == start + 5) {
694       ++p; // 0 in column 6 must treated as a space
695     } else {
696       break;
697     }
698   }
699   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
700     return true;
701   }
702   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
703     return true;
704   }
705   return *p == '\n';
706 }
707 
IsFreeFormComment(const char * p) const708 const char *Prescanner::IsFreeFormComment(const char *p) const {
709   p = SkipWhiteSpaceAndCComments(p);
710   if (*p == '!' || *p == '\n') {
711     return p;
712   } else {
713     return nullptr;
714   }
715 }
716 
IsIncludeLine(const char * start) const717 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
718   const char *p{SkipWhiteSpace(start)};
719   for (char ch : "include"s) {
720     if (ToLowerCaseLetter(*p++) != ch) {
721       return std::nullopt;
722     }
723   }
724   p = SkipWhiteSpace(p);
725   if (*p == '"' || *p == '\'') {
726     return {p - start};
727   }
728   return std::nullopt;
729 }
730 
// Processes a Fortran INCLUDE line.  `firstQuote` points at, or before, the
// opening quote of the path name.  The quoted path is extracted (a doubled
// quote stands for one quote character), the file is opened with the
// directory of the current source file pushed onto the search path, and
// a non-empty file is prescanned by a nested copy of this prescanner.
FortranInclude(const char * firstQuote)731 void Prescanner::FortranInclude(const char *firstQuote) {
732   const char *p{firstQuote};
733   while (*p != '"' && *p != '\'') {
734     ++p;
735   }
736   char quote{*p};
737   std::string path;
738   for (++p; *p != '\n'; ++p) {
739     if (*p == quote) {
      // A lone quote ends the path; a doubled quote contributes one quote.
740       if (p[1] != quote) {
741         break;
742       }
743       ++p;
744     }
745     path += *p;
746   }
  // The loop above stops either at the closing quote or at the newline.
747   if (*p != quote) {
748     Say(GetProvenanceRange(firstQuote, p),
749         "malformed path name string"_err_en_US);
750     return;
751   }
752   p = SkipWhiteSpace(p + 1);
  // Only white space or a '!' comment may follow the path; anything else
  // draws a message but does not prevent the inclusion.
753   if (*p != '\n' && *p != '!') {
754     const char *garbage{p};
755     for (; *p != '\n' && *p != '!'; ++p) {
756     }
757     Say(GetProvenanceRange(garbage, p),
758         "excess characters after path name"_en_US);
759   }
760   std::string buf;
761   llvm::raw_string_ostream error{buf};
762   Provenance provenance{GetProvenance(nextLine_)};
763   const SourceFile *currentFile{allSources_.GetSourceFile(provenance)};
  // The including file's directory is on the search path only for the
  // duration of the Open() call.
764   if (currentFile) {
765     allSources_.PushSearchPathDirectory(DirectoryName(currentFile->path()));
766   }
767   const SourceFile *included{allSources_.Open(path, error)};
768   if (currentFile) {
769     allSources_.PopSearchPathDirectory();
770   }
771   if (!included) {
772     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
773   } else if (included->bytes() > 0) {
774     ProvenanceRange includeLineRange{
775         provenance, static_cast<std::size_t>(p - nextLine_)};
776     ProvenanceRange fileRange{
777         allSources_.AddIncludedFile(*included, includeLineRange)};
    // The copy constructor records one more level of nesting, which
    // Prescan() checks against maxPrescannerNesting.
778     Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
779   }
780 }
781 
IsPreprocessorDirectiveLine(const char * start) const782 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
783   const char *p{start};
784   for (; *p == ' '; ++p) {
785   }
786   if (*p == '#') {
787     if (inFixedForm_ && p == start + 5) {
788       return nullptr;
789     }
790   } else {
791     p = SkipWhiteSpace(p);
792     if (*p != '#') {
793       return nullptr;
794     }
795   }
796   return SkipWhiteSpace(p + 1);
797 }
798 
IsNextLinePreprocessorDirective() const799 bool Prescanner::IsNextLinePreprocessorDirective() const {
800   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
801 }
802 
// Examines the line at nextLine_ while looking for a continuation line.
// Returns true when that line was consumed here (a comment line, or
// a preprocessing directive that was executed) and the caller should look
// again; returns false otherwise.  `afterAmpersand` is presumably true when
// the current line ended with a '&' continuation marker (confirm against
// callers).
SkipCommentLine(bool afterAmpersand)803 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
804   if (IsAtEnd()) {
805     if (afterAmpersand && prescannerNesting_ > 0) {
806       // A continuation marker at the end of the last line in an
807       // include file inhibits the newline for that line.
808       SkipToEndOfLine();
809       omitNewline_ = true;
810     }
811     return false;
812   }
813   auto lineClass{ClassifyLine(nextLine_)};
814   if (lineClass.kind == LineClassification::Kind::Comment) {
815     NextLine();
816     return true;
817   } else if (inPreprocessorDirective_) {
    // Within a directive, only comment lines are skipped.
818     return false;
819   } else if (lineClass.kind ==
820           LineClassification::Kind::ConditionalCompilationDirective ||
821       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
822     // Allow conditional compilation directives (e.g., #ifdef) to affect
823     // continuation lines.
824     // Allow other preprocessor directives, too, except #include
825     // (when it does not follow '&'), #define, and #undef (because
826     // they cannot be allowed to affect preceding text on a
827     // continued line).
828     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
829     return true;
830   } else if (afterAmpersand &&
831       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
832           lineClass.kind == LineClassification::Kind::IncludeLine)) {
    // An include after '&': finish the current line without a newline, and
    // have Statement() drop one leading '&' from the next free-form source
    // line.
833     SkipToEndOfLine();
834     omitNewline_ = true;
835     skipLeadingAmpersand_ = true;
836     return false;
837   } else {
838     return false;
839   }
840 }
841 
// Determines whether the line at nextLine_ is a fixed-form continuation
// line.  Returns a pointer to the position on that line at which scanning
// resumes, or nullptr when the line is not a continuation or the input is
// at its end.  Unless at end of input, tabInCurrentLine_ is reset as a side
// effect.  `mightNeedSpace` is consulted only for compiler directive
// continuations, where it may request that a space be inserted.
FixedFormContinuationLine(bool mightNeedSpace)842 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
843   if (IsAtEnd()) {
844     return nullptr;
845   }
846   tabInCurrentLine_ = false;
847   char col1{*nextLine_};
848   if (InCompilerDirective()) {
849     // Must be a continued compiler directive.
850     if (!IsFixedFormCommentChar(col1)) {
851       return nullptr;
852     }
    // Columns 2-5 must repeat the sentinel of the directive being
    // continued; the source characters are lower-cased for the comparison.
853     int j{1};
854     for (; j < 5; ++j) {
855       char ch{directiveSentinel_[j - 1]};
856       if (ch == '\0') {
857         break;
858       }
859       if (ch != ToLowerCaseLetter(nextLine_[j])) {
860         return nullptr;
861       }
862     }
    // Any columns through column 5 left over after a short sentinel must
    // be blank.
863     for (; j < 5; ++j) {
864       if (nextLine_[j] != ' ') {
865         return nullptr;
866       }
867     }
    // Column 6 marks a continuation when it is not a newline, tab, blank,
    // or '0'.
868     char col6{nextLine_[5]};
869     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
870       if (nextLine_[6] != ' ' && mightNeedSpace) {
871         insertASpace_ = true;
872       }
873       return nextLine_ + 6;
874     }
875     return nullptr;
876   } else {
877     // Normal case: not in a compiler directive.
878     if (col1 == '&' &&
879         features_.IsEnabled(
880             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
881       // Extension: '&' as continuation marker
882       if (features_.ShouldWarn(
883               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
884         Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
885       }
886       return nextLine_ + 1;
887     }
    // A tab followed by a digit 1-9 also marks a continuation line.
888     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
889       tabInCurrentLine_ = true;
890       return nextLine_ + 2; // VAX extension
891     }
    // Standard form: five blanks and then a continuation character in
    // column 6.
892     if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
893         nextLine_[3] == ' ' && nextLine_[4] == ' ') {
894       char col6{nextLine_[5]};
895       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
896         return nextLine_ + 6;
897       }
898     }
    // With an implicit continuation, scanning resumes at the very start of
    // the next line.
899     if (IsImplicitContinuation()) {
900       return nextLine_;
901     }
902   }
903   return nullptr; // not a continuation line
904 }
905 
FreeFormContinuationLine(bool ampersand)906 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
907   const char *p{nextLine_};
908   if (p >= limit_) {
909     return nullptr;
910   }
911   p = SkipWhiteSpace(p);
912   if (InCompilerDirective()) {
913     if (*p++ != '!') {
914       return nullptr;
915     }
916     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
917       if (*s != ToLowerCaseLetter(*p)) {
918         return nullptr;
919       }
920     }
921     p = SkipWhiteSpace(p);
922     if (*p == '&') {
923       if (!ampersand) {
924         insertASpace_ = true;
925       }
926       return p + 1;
927     } else if (ampersand) {
928       return p;
929     } else {
930       return nullptr;
931     }
932   } else {
933     if (*p == '&') {
934       return p + 1;
935     } else if (*p == '!' || *p == '\n' || *p == '#') {
936       return nullptr;
937     } else if (ampersand || IsImplicitContinuation()) {
938       if (p > nextLine_) {
939         --p;
940       } else {
941         insertASpace_ = true;
942       }
943       return p;
944     } else {
945       return nullptr;
946     }
947   }
948 }
949 
FixedFormContinuation(bool mightNeedSpace)950 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
951   // N.B. We accept '&' as a continuation indicator in fixed form, too,
952   // but not in a character literal.
953   if (*at_ == '&' && inCharLiteral_) {
954     return false;
955   }
956   do {
957     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
958       BeginSourceLine(cont);
959       column_ = 7;
960       NextLine();
961       return true;
962     }
963   } while (SkipCommentLine(false /* not after ampersand */));
964   return false;
965 }
966 
FreeFormContinuation()967 bool Prescanner::FreeFormContinuation() {
968   const char *p{at_};
969   bool ampersand{*p == '&'};
970   if (ampersand) {
971     p = SkipWhiteSpace(p + 1);
972   }
973   if (*p != '\n') {
974     if (inCharLiteral_) {
975       return false;
976     } else if (*p != '!' &&
977         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
978       Say(GetProvenance(p), "missing ! before comment after &"_en_US);
979     }
980   }
981   do {
982     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
983       BeginSourceLine(cont);
984       NextLine();
985       return true;
986     }
987   } while (SkipCommentLine(ampersand));
988   return false;
989 }
990 
991 // Implicit line continuation allows a preprocessor macro call with
992 // arguments to span multiple lines.
IsImplicitContinuation() const993 bool Prescanner::IsImplicitContinuation() const {
994   return !inPreprocessorDirective_ && !inCharLiteral_ &&
995       delimiterNesting_ > 0 && !IsAtEnd() &&
996       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
997 }
998 
Continuation(bool mightNeedFixedFormSpace)999 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1000   if (*at_ == '\n' || *at_ == '&') {
1001     if (inFixedForm_) {
1002       return FixedFormContinuation(mightNeedFixedFormSpace);
1003     } else {
1004       return FreeFormContinuation();
1005     }
1006   } else {
1007     return false;
1008   }
1009 }
1010 
1011 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const1012 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1013   const char *p{start};
1014   char col1{*p++};
1015   if (!IsFixedFormCommentChar(col1)) {
1016     return std::nullopt;
1017   }
1018   char sentinel[5], *sp{sentinel};
1019   int column{2};
1020   for (; column < 6; ++column, ++p) {
1021     if (*p != ' ') {
1022       if (*p == '\n' || *p == '\t') {
1023         break;
1024       }
1025       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1026         // OpenMP conditional compilation line: leave the label alone
1027         break;
1028       }
1029       *sp++ = ToLowerCaseLetter(*p);
1030     }
1031   }
1032   if (column == 6) {
1033     if (*p == ' ' || *p == '\t' || *p == '0') {
1034       ++p;
1035     } else {
1036       // This is a Continuation line, not an initial directive line.
1037       return std::nullopt;
1038     }
1039   }
1040   if (sp == sentinel) {
1041     return std::nullopt;
1042   }
1043   *sp = '\0';
1044   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1045     std::size_t payloadOffset = p - start;
1046     return {LineClassification{
1047         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1048   }
1049   return std::nullopt;
1050 }
1051 
1052 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1053 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1054   char sentinel[8];
1055   const char *p{SkipWhiteSpace(start)};
1056   if (*p++ != '!') {
1057     return std::nullopt;
1058   }
1059   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1060     if (*p == '\n') {
1061       break;
1062     }
1063     if (*p == ' ' || *p == '\t' || *p == '&') {
1064       if (j == 0) {
1065         break;
1066       }
1067       sentinel[j] = '\0';
1068       p = SkipWhiteSpace(p + 1);
1069       if (*p == '!') {
1070         break;
1071       }
1072       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1073         std::size_t offset = p - start;
1074         return {LineClassification{
1075             LineClassification::Kind::CompilerDirective, offset, sp}};
1076       }
1077       break;
1078     }
1079     sentinel[j] = ToLowerCaseLetter(*p);
1080   }
1081   return std::nullopt;
1082 }
1083 
AddCompilerDirectiveSentinel(const std::string & dir)1084 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1085   std::uint64_t packed{0};
1086   for (char ch : dir) {
1087     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1088   }
1089   compilerDirectiveBloomFilter_.set(packed % prime1);
1090   compilerDirectiveBloomFilter_.set(packed % prime2);
1091   compilerDirectiveSentinels_.insert(dir);
1092   return *this;
1093 }
1094 
IsCompilerDirectiveSentinel(const char * sentinel) const1095 const char *Prescanner::IsCompilerDirectiveSentinel(
1096     const char *sentinel) const {
1097   std::uint64_t packed{0};
1098   std::size_t n{0};
1099   for (; sentinel[n] != '\0'; ++n) {
1100     packed = (packed << 8) | (sentinel[n] & 0xff);
1101   }
1102   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1103       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1104     return nullptr;
1105   }
1106   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1107   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1108 }
1109 
IsDirective(const char * match,const char * dir)1110 constexpr bool IsDirective(const char *match, const char *dir) {
1111   for (; *match; ++match) {
1112     if (*match != ToLowerCaseLetter(*dir++)) {
1113       return false;
1114     }
1115   }
1116   return true;
1117 }
1118 
ClassifyLine(const char * start) const1119 Prescanner::LineClassification Prescanner::ClassifyLine(
1120     const char *start) const {
1121   if (inFixedForm_) {
1122     if (std::optional<LineClassification> lc{
1123             IsFixedFormCompilerDirectiveLine(start)}) {
1124       return std::move(*lc);
1125     }
1126     if (IsFixedFormCommentLine(start)) {
1127       return {LineClassification::Kind::Comment};
1128     }
1129   } else {
1130     if (std::optional<LineClassification> lc{
1131             IsFreeFormCompilerDirectiveLine(start)}) {
1132       return std::move(*lc);
1133     }
1134     if (const char *bang{IsFreeFormComment(start)}) {
1135       return {LineClassification::Kind::Comment,
1136           static_cast<std::size_t>(bang - start)};
1137     }
1138   }
1139   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1140     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1141   }
1142   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1143     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1144         IsDirective("else", dir) || IsDirective("endif", dir)) {
1145       return {LineClassification::Kind::ConditionalCompilationDirective};
1146     } else if (IsDirective("include", dir)) {
1147       return {LineClassification::Kind::IncludeDirective};
1148     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1149       return {LineClassification::Kind::DefinitionDirective};
1150     } else {
1151       return {LineClassification::Kind::PreprocessorDirective};
1152     }
1153   }
1154   return {LineClassification::Kind::Source};
1155 }
1156 
SourceFormChange(std::string && dir)1157 void Prescanner::SourceFormChange(std::string &&dir) {
1158   if (dir == "!dir$ free") {
1159     inFixedForm_ = false;
1160   } else if (dir == "!dir$ fixed") {
1161     inFixedForm_ = true;
1162   }
1163 }
1164 } // namespace Fortran::parser
1165