1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include <stdio.h>
5 #include <string>
6 #include <stdlib.h>
7 #include <errno.h>
8 #include <string.h>
9 #include <iostream>
10 #include <fstream>
11
12 // We only use U8_* macros, which are entirely inline.
13 #include "unicode/utf8.h"
14
15 // This contains a codepage and ISO 14882:1998 illegality table.
16 // Use "make gen-table" to rebuild it.
17 #include "cptbl.h"
18
19 /**
20 * What is this?
21 *
22 * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
23 * in utf-8 into something consumable by certain compilers (Solaris, xlC)
24 * which aren't quite standards compliant.
25 *
26 * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
27 * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
28 * (some compilers do not support the u8 prefix correctly.)
29 * - if the system is EBCDIC-based, that is used to correct the input characters.
30 *
31 * Usage:
32 * escapesrc infile.cpp outfile.cpp
33 * Normally this is invoked by the build stage, with a rule such as:
34 *
35 * _%.cpp: $(srcdir)/%.cpp
36 * @$(BINDIR)/escapesrc$(EXEEXT) $< $@
37 * %.o: _%.cpp
38 * $(COMPILE.cc) ... $@ $<
39 *
40 * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp
41 * from being itself escaped.
42 */
43
44
// Whitespace bytes recognized by skipws(), given as explicit numeric
// values rather than character literals.
static const char kSPACE = 0x20;
static const char kTAB   = 0x09;
static const char kLF    = 0x0A;
static const char kCR    = 0x0D;
50
// For convenience: look up `c` in the cp1047_8859_1 table (expected to come
// from cptbl.h, included above). Going by the names, this maps an EBCDIC
// codepage 1047 byte to its ISO 8859-1 equivalent - confirm against cptbl.h.
// NOTE: `c` is used directly as an array index with no cast.
# define cp1047_to_8859(c) cp1047_8859_1[c]
53
// Our app's name. Assigned from argv[0] in main() and used as the prefix of
// diagnostic messages written to stderr.
std::string prog;
56
57 /**
58 * Give the usual 1-line documentation and exit
59 */
usage()60 void usage() {
61 fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
62 }
63
64 /**
65 * Delete the output file (if any)
66 * We want to delete even if we didn't generate, because it might be stale.
67 */
cleanup(const std::string & outfile)68 int cleanup(const std::string &outfile) {
69 const char *outstr = outfile.c_str();
70 if(outstr && *outstr) {
71 int rc = std::remove(outstr);
72 if(rc == 0) {
73 fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
74 return 0;
75 } else {
76 if( errno == ENOENT ) {
77 return 0; // File did not exist - no error.
78 } else {
79 perror("std::remove");
80 return 1;
81 }
82 }
83 }
84 return 0;
85 }
86
87 /**
88 * Skip across any known whitespace.
89 * @param p startpoint
90 * @param e limit
91 * @return first non-whitespace char
92 */
skipws(const char * p,const char * e)93 inline const char *skipws(const char *p, const char *e) {
94 for(;p<e;p++) {
95 switch(*p) {
96 case kSPACE:
97 case kTAB:
98 case kLF:
99 case kCR:
100 break;
101 default:
102 return p; // non ws
103 }
104 }
105 return p;
106 }
107
/**
 * Append one byte to a string as a C hex escape: a backslash, 'x',
 * and exactly two uppercase hex digits (for example \x0A).
 * @param outstr string to append to
 * @param byte the byte to append
 */
void appendByte(std::string &outstr,
                uint8_t byte) {
  static const char kHexDigits[] = "0123456789ABCDEF";
  outstr += '\\';
  outstr += 'x';
  outstr += kHexDigits[(byte >> 4) & 0x0F]; // high nibble
  outstr += kHexDigits[byte & 0x0F];        // low nibble
}
119
120 /**
121 * Append the bytes from 'linestr' into outstr, with escaping
122 * @param outstr the output buffer
123 * @param linestr the input buffer
124 * @param pos in/out: the current char under consideration
125 * @param chars the number of chars to consider
126 * @return true on failure
127 */
appendUtf8(std::string & outstr,const std::string & linestr,size_t & pos,size_t chars)128 bool appendUtf8(std::string &outstr,
129 const std::string &linestr,
130 size_t &pos,
131 size_t chars) {
132 char tmp[9];
133 for(size_t i=0;i<chars;i++) {
134 tmp[i] = linestr[++pos];
135 }
136 tmp[chars] = 0;
137 unsigned int c;
138 sscanf(tmp, "%X", &c);
139 UChar32 ch = c & 0x1FFFFF;
140
141 // now to append \\x%% etc
142 uint8_t bytesNeeded = U8_LENGTH(ch);
143 if(bytesNeeded == 0) {
144 fprintf(stderr, "Illegal code point U+%X\n", ch);
145 return true;
146 }
147 uint8_t bytes[4];
148 uint8_t *s = bytes;
149 size_t i = 0;
150 U8_APPEND_UNSAFE(s, i, ch);
151 for(size_t t = 0; t<i; t++) {
152 appendByte(outstr, s[t]);
153 }
154 return false;
155 }
156
157 /**
158 * Fixup u8"x"
159 * @param linestr string to mutate. Already escaped into \u format.
160 * @param origpos beginning, points to 'u8"'
161 * @param pos end, points to "
162 * @return false for no-problem, true for failure!
163 */
fixu8(std::string & linestr,size_t origpos,size_t & endpos)164 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
165 size_t pos = origpos + 3;
166 std::string outstr;
167 outstr += '\"'; // local encoding
168 for(;pos<endpos;pos++) {
169 char c = linestr[pos];
170 if(c == '\\') {
171 char c2 = linestr[++pos];
172 switch(c2) {
173 case '\'':
174 case '"':
175 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
176 c2 = cp1047_to_8859(c2);
177 #endif
178 appendByte(outstr, c2);
179 break;
180 case 'u':
181 appendUtf8(outstr, linestr, pos, 4);
182 break;
183 case 'U':
184 appendUtf8(outstr, linestr, pos, 8);
185 break;
186 }
187 } else {
188 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
189 c = cp1047_to_8859(c);
190 #endif
191 appendByte(outstr, c);
192 }
193 }
194 outstr += ('\"');
195
196 linestr.replace(origpos, (endpos-origpos+1), outstr);
197
198 return false; // OK
199 }
200
/**
 * fix the u"x"/u'x'/u8"x" string at the position
 * u8'x' is not supported, sorry.
 *
 * Starting at the 'u' prefix, scans forward to the matching closing quote.
 * Along the way, each character that is not part of a backslash escape is
 * decoded as UTF-8 and replaced in place by \uXXXX (or \UXXXXXXXX above
 * U+FFFF), except single bytes that the oldIllegal table flags. For u8"..."
 * the finished literal is then handed to fixu8().
 *
 * If the end of the line is reached without finding a closing quote, the
 * function returns false (no error); a u8 literal is then not passed to fixu8.
 *
 * @param linestr the input string (modified in place)
 * @param pos the position of the leading 'u'
 * @return false = no err, true = had err
 */
bool fixAt(std::string &linestr, size_t pos) {
  size_t origpos = pos; // remembered for fixu8(), which needs the literal's start

  if(linestr[pos] != 'u') {
    fprintf(stderr, "Not a 'u'?");
    return true;
  }

  pos++; // past 'u'

  bool utf8 = false;

  if(linestr[pos] == '8') { // u8"
    utf8 = true;
    pos++;
  }

  // pos is now expected to be on the opening quote character
  char quote = linestr[pos];

  if(quote != '\'' && quote != '\"') {
    fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
    return true;
  }

  if(quote == '\'' && utf8) {
    fprintf(stderr, "Cannot do u8'...'\n");
    return true;
  }

  pos ++; // past the opening quote, onto the first content char

  //printf("u%c…%c\n", quote, quote);

  for(; pos < linestr.size(); pos++) {
    if(linestr[pos] == quote) {
      // Unescaped closing quote: the literal ends here.
      if(utf8) {
        return fixu8(linestr, origpos, pos); // fix u8"..."
      } else {
        return false; // end of quote
      }
    }
    if(linestr[pos] == '\\') {
      // Backslash escape: step onto the escaped char; the loop's own pos++
      // then moves past it, so it is never treated as a quote or as UTF-8.
      // All three 'continue's below behave the same as falling through.
      pos++;
      if(linestr[pos] == quote) continue; // quoted quote
      if(linestr[pos] == 'u') continue; // for now ... unicode escape
      if(linestr[pos] == '\\') continue;
      // some other escape… ignore
    } else {
      size_t old_pos = pos; // only used for the error message below
      int32_t i = pos;      // decode cursor; U8_NEXT advances it past the sequence
#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
      // mogrify 1-4 bytes from 1047 'back' to utf-8
      char old_byte = linestr[pos]; // kept so the lead byte can be restored if not escaped
      linestr[pos] = cp1047_to_8859(linestr[pos]);
      // how many more?
      int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
      // NOTE(review): pos2 is not checked against linestr.size() here.
      for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
        linestr[pos2] = cp1047_to_8859(linestr[pos2]);
        if(linestr[pos2] == 0x0A) {
          linestr[pos2] = 0x85; // NL is ambiguous here
        }
      }
#endif

      // Proceed to decode utf-8
      // s is re-fetched on every iteration because linestr is modified
      // by the replace() at the bottom of this branch.
      const uint8_t *s = (const uint8_t*) (linestr.c_str());
      int32_t length = linestr.size();
      UChar32 c;
      // oldIllegal presumably comes from cptbl.h (the "illegality table"
      // mentioned next to its #include); flagged single bytes are left as-is.
      if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
        linestr[pos] = old_byte; // put it back
#endif
        continue; // single code point not previously legal for \u escaping
      }

      // otherwise, convert it to \u / \U
      {
        U8_NEXT(s, i, length, c);
      }
      if(c<0) {
        // A negative code point is treated as a decoding failure.
        fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos);
        fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
        return true;
      }

      // Number of input bytes the decoded sequence occupied.
      size_t seqLen = (i-pos);

      //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);

      char newSeq[20];
      if( c <= 0xFFFF) {
        sprintf(newSeq, "\\u%04X", c);
      } else {
        sprintf(newSeq, "\\U%08X", c);
      }
      linestr.replace(pos, seqLen, newSeq);
      // Land on the last char of the inserted escape; the loop's pos++
      // then continues with the first char after it.
      pos += strlen(newSeq) - 1;
    }
  }

  // Ran off the end of the line without seeing a closing quote.
  return false;
}
310
311 /**
312 * Fixup an entire line
313 * false = no err
314 * true = had err
315 * @param no the line number (not used)
316 * @param linestr the string to fix
317 * @return true if any err, else false
318 */
fixLine(int,std::string & linestr)319 bool fixLine(int /*no*/, std::string &linestr) {
320 const char *line = linestr.c_str();
321 size_t len = linestr.size();
322
323 // no u' in the line?
324 if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
325 return false; // Nothing to do. No u' or u" detected
326 }
327
328 // start from the end and find all u" cases
329 size_t pos = len = linestr.size();
330 if(len>INT32_MAX/2) {
331 return true;
332 }
333 while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
334 //printf("found doublequote at %d\n", pos);
335 if(fixAt(linestr, pos)) return true;
336 if(pos == 0) break;
337 pos--;
338 }
339
340 // reset and find all u' cases
341 pos = len = linestr.size();
342 while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
343 //printf("found singlequote at %d\n", pos);
344 if(fixAt(linestr, pos)) return true;
345 if(pos == 0) break;
346 pos--;
347 }
348
349 // reset and find all u8" cases
350 pos = len = linestr.size();
351 while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
352 if(fixAt(linestr, pos)) return true;
353 if(pos == 0) break;
354 pos--;
355 }
356
357 //fprintf(stderr, "%d - fixed\n", no);
358 return false;
359 }
360
361 /**
362 * Convert a whole file
363 * @param infile
364 * @param outfile
365 * @return 1 on err, 0 otherwise
366 */
convert(const std::string & infile,const std::string & outfile)367 int convert(const std::string &infile, const std::string &outfile) {
368 fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());
369
370 std::ifstream inf;
371
372 inf.open(infile.c_str(), std::ios::in);
373
374 if(!inf.is_open()) {
375 fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
376 cleanup(outfile);
377 return 1;
378 }
379
380 std::ofstream outf;
381
382 outf.open(outfile.c_str(), std::ios::out);
383
384 if(!outf.is_open()) {
385 fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
386 return 1;
387 }
388
389 // TODO: any platform variations of #line?
390 outf << "#line 1 \"" << infile << "\"" << '\n';
391
392 int no = 0;
393 std::string linestr;
394 while( getline( inf, linestr)) {
395 no++;
396 if(fixLine(no, linestr)) {
397 goto fail;
398 }
399 outf << linestr << '\n';
400 }
401
402 if(inf.eof()) {
403 return 0;
404 }
405 fail:
406 outf.close();
407 fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
408 cleanup(outfile);
409 return 1;
410 }
411
412 /**
413 * Main function
414 */
main(int argc,const char * argv[])415 int main(int argc, const char *argv[]) {
416 prog = argv[0];
417
418 if(argc != 3) {
419 usage();
420 return 1;
421 }
422
423 std::string infile = argv[1];
424 std::string outfile = argv[2];
425
426 return convert(infile, outfile);
427 }
428