/*
 * Copyright 2011 - 2015
 * André Malo or his licensors, as applicable
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18 #include "cext.h"
/*
 * Forward declaration of the module init function; its body is at the end
 * of this file.
 * NOTE(review): EXT_INIT_FUNC is a macro from cext.h -- presumably it expands
 * to the version-specific init function declarator; confirm there.
 */
EXT_INIT_FUNC;
20
/*
 * Character class flags stored in rjsmin_charmask[]. Every table entry is
 * the bitwise OR of the classes the corresponding ASCII character belongs to.
 */

/* Copied straight to the output by the main loop, no further inspection */
#define RJSMIN_DULL_BIT           0x001
/* A '/' directly after such a character starts a regex literal */
#define RJSMIN_PRE_REGEX_BIT      0x002
/* Ordinary character inside a regex literal */
#define RJSMIN_REGEX_DULL_BIT     0x004
/* Ordinary character inside a regex character class ([...]) */
#define RJSMIN_REGEX_CC_DULL_BIT  0x008
/* Identifier/literal character (a space between two of them is kept) */
#define RJSMIN_ID_LIT_BIT         0x010
/* Character which may follow a kept newline (checked on the next char) */
#define RJSMIN_ID_LIT_O_BIT       0x020
/* Character which may precede a kept newline (checked on the last char) */
#define RJSMIN_ID_LIT_C_BIT       0x040
/* Ordinary character inside a string literal */
#define RJSMIN_STRING_DULL_BIT    0x080
/* Whitespace which is skipped (line terminators are handled separately) */
#define RJSMIN_SPACE_BIT          0x100
/* A newline between a regex literal and such a character is kept */
#define RJSMIN_POST_REGEX_OFF_BIT 0x200
31
/*
 * Character unit the minifier works on: Py_UNICODE when EXT3 is defined,
 * plain bytes otherwise.
 */
#ifndef EXT3
typedef unsigned char rchar;
#else
typedef Py_UNICODE rchar;
#endif

/* Convert a character (constant) to the working character type */
#define U(c) ((rchar)(c))
38
/*
 * Character classification predicates, based on rjsmin_charmask[].
 *
 * The table covers 7-bit ASCII only. Two flavours exist:
 *
 * - RJSMIN_ABOVE_ASCII_OR(): characters > 127 always match, ASCII
 *   characters match if the flag is set in the table.
 * - RJSMIN_ASCII_AND(): characters > 127 never match, ASCII characters
 *   match if the flag is set in the table.
 *
 * Note that the argument is evaluated more than once.
 */
#define RJSMIN_ABOVE_ASCII_OR(c, bit) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & (bit)))

#define RJSMIN_ASCII_AND(c, bit) ((U(c) <= 127) && \
    (rjsmin_charmask[U(c) & 0x7F] & (bit)))

#define RJSMIN_IS_DULL(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_DULL_BIT)

#define RJSMIN_IS_REGEX_DULL(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_REGEX_DULL_BIT)

#define RJSMIN_IS_REGEX_CC_DULL(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_REGEX_CC_DULL_BIT)

#define RJSMIN_IS_STRING_DULL(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_STRING_DULL_BIT)

#define RJSMIN_IS_ID_LITERAL(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_ID_LIT_BIT)

#define RJSMIN_IS_ID_LITERAL_OPEN(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_ID_LIT_O_BIT)

#define RJSMIN_IS_ID_LITERAL_CLOSE(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_ID_LIT_C_BIT)

#define RJSMIN_IS_POST_REGEX_OFF(c) \
    RJSMIN_ABOVE_ASCII_OR(c, RJSMIN_POST_REGEX_OFF_BIT)

#define RJSMIN_IS_SPACE(c) \
    RJSMIN_ASCII_AND(c, RJSMIN_SPACE_BIT)

#define RJSMIN_IS_PRE_REGEX_1(c) \
    RJSMIN_ASCII_AND(c, RJSMIN_PRE_REGEX_BIT)
68
69
/*
 * Classification table, indexed by 7-bit ASCII code. Each entry is a
 * combination of the RJSMIN_*_BIT flags (written in hex, so the single
 * bits are easier to spot). The trailing comments give the index range of
 * each row.
 */
static const unsigned short rjsmin_charmask[128] = {
    0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, /*   0 ..   7 */
    0x18C, 0x18C, 0x002, 0x18C, 0x18C, 0x002, 0x18C, 0x18C, /*   8 ..  15 */
    0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, /*  16 ..  23 */
    0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, 0x18C, /*  24 ..  31 */
    0x18C, 0x2AF, 0x24C, 0x28D, 0x2FD, 0x28D, 0x08F, 0x24C, /*  32 ..  39 */
    0x2AF, 0x0CD, 0x28D, 0x0ED, 0x08F, 0x0ED, 0x08D, 0x288, /*  40 ..  47 */
    0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /*  48 ..  55 */
    0x2FD, 0x2FD, 0x08F, 0x08F, 0x28D, 0x08F, 0x28D, 0x08F, /*  56 ..  63 */
    0x28D, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /*  64 ..  71 */
    0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /*  72 ..  79 */
    0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /*  80 ..  87 */
    0x2FD, 0x2FD, 0x2FD, 0x2AB, 0x201, 0x0C5, 0x28D, 0x2FD, /*  88 ..  95 */
    0x28D, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /*  96 .. 103 */
    0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /* 104 .. 111 */
    0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, 0x2FD, /* 112 .. 119 */
    0x2FD, 0x2FD, 0x2FD, 0x2AF, 0x08F, 0x0CF, 0x28D, 0x2FD  /* 120 .. 127 */
};
88
89 static Py_ssize_t
rjsmin(const rchar * source,rchar * target,Py_ssize_t length,int keep_bang_comments)90 rjsmin(const rchar *source, rchar *target, Py_ssize_t length,
91 int keep_bang_comments)
92 {
93 const rchar *reset, *pcreset = NULL, *pctoken = NULL, *xtarget,
94 *sentinel = source + length;
95 rchar *tstart = target;
96 int post_regex = 0;
97 rchar c, quote, spaced = U(' ');
98
99 while (source < sentinel) {
100 c = *source++;
101 if (RJSMIN_IS_DULL(c)) {
102 if (post_regex) post_regex = 0;
103 if (pctoken) pctoken = NULL;
104 if (spaced == U('\n')) spaced = U(' ');
105
106 *target++ = c;
107 continue;
108 }
109 switch (c) {
110
111 /* String */
112 case U('\''): case U('"'):
113 if (post_regex) post_regex = 0;
114 if (pctoken) pctoken = NULL;
115 if (spaced == U('\n')) spaced = U(' ');
116
117 reset = source;
118 *target++ = quote = c;
119 while (source < sentinel) {
120 c = *source++;
121 *target++ = c;
122 if (RJSMIN_IS_STRING_DULL(c))
123 continue;
124 switch (c) {
125 case U('\''): case U('"'):
126 if (c == quote)
127 goto cont;
128 continue;
129 case U('\\'):
130 if (source < sentinel) {
131 c = *source++;
132 *target++ = c;
133 if (c == U('\r') && source < sentinel
134 && *source == U('\n'))
135 *target++ = *source++;
136 }
137 continue;
138 }
139 break;
140 }
141 target -= source - reset;
142 source = reset;
143 continue;
144
145 /* Comment or Regex or something else entirely */
146 case U('/'):
147 if (!(source < sentinel)) {
148 if (post_regex) post_regex = 0;
149 if (pctoken) pctoken = NULL;
150 if (spaced == U('\n')) spaced = U(' ');
151
152 *target++ = c;
153 }
154 else {
155 switch (*source) {
156 /* Comment */
157 case U('*'): case U('/'):
158 goto skip_or_copy_ws;
159
160 default:
161 xtarget = NULL;
162 if ( target == tstart
163 || RJSMIN_IS_PRE_REGEX_1(*((pctoken ? pctoken : target)
164 - 1))
165 || (
166 (xtarget = pctoken ? pctoken : target)
167 && (xtarget - tstart >= 6)
168 && *(xtarget - 1) == U('n')
169 && *(xtarget - 2) == U('r')
170 && *(xtarget - 3) == U('u')
171 && *(xtarget - 4) == U('t')
172 && *(xtarget - 5) == U('e')
173 && *(xtarget - 6) == U('r')
174 && (
175 xtarget - tstart == 6
176 || !RJSMIN_IS_ID_LITERAL(*(xtarget - 7))
177 )
178 )) {
179
180 /* Regex */
181 if (post_regex) post_regex = 0;
182 if (pctoken) pctoken = NULL;
183
184 reset = source;
185 if (spaced == U('\n')) {
186 spaced = U(' ');
187 if (xtarget)
188 *target++ = U('\n');
189 }
190
191 *target++ = U('/');
192 while (source < sentinel) {
193 c = *source++;
194 *target++ = c;
195 if (RJSMIN_IS_REGEX_DULL(c))
196 continue;
197 switch (c) {
198 case U('/'):
199 post_regex = 1;
200 goto cont;
201 case U('\\'):
202 if (source < sentinel) {
203 c = *source++;
204 *target++ = c;
205 if (c == U('\r') || c == U('\n'))
206 break;
207 }
208 continue;
209 case U('['):
210 while (source < sentinel) {
211 c = *source++;
212 *target++ = c;
213 if (RJSMIN_IS_REGEX_CC_DULL(c))
214 continue;
215 switch (c) {
216 case U('\\'):
217 if (source < sentinel) {
218 c = *source++;
219 *target++ = c;
220 if (c == U('\r') || c == U('\n'))
221 break;
222 }
223 continue;
224 case U(']'):
225 goto cont_regex;
226 }
227 }
228 break;
229 }
230 break;
231 cont_regex:
232 continue;
233 }
234 target -= source - reset;
235 source = reset;
236 }
237 else {
238 /* Just a slash */
239 if (post_regex) post_regex = 0;
240 if (pctoken) pctoken = NULL;
241 if (spaced == U('\n')) spaced = U(' ');
242
243 *target++ = c;
244 }
245 continue;
246 }
247 }
248 continue;
249
250 /* Whitespace */
251 default:
252 skip_or_copy_ws:
253 quote = U(' ');
254 --source;
255 while (source < sentinel) {
256 c = *source++;
257 if (RJSMIN_IS_SPACE(c))
258 continue;
259 switch (c) {
260 case U('\r'): case U('\n'):
261 quote = U('\n');
262 continue;
263 case U('/'):
264 if (source < sentinel) {
265 switch (*source) {
266 case U('*'):
267 reset = source++;
268 /* copy bang comment, if requested */
269 if ( keep_bang_comments && source < sentinel
270 && *source == U('!')) {
271 if (!pctoken) {
272 pctoken = target;
273 pcreset = reset;
274 }
275
276 *target++ = U('/');
277 *target++ = U('*');
278 *target++ = *source++;
279 while (source < sentinel) {
280 c = *source++;
281 *target++ = c;
282 if (c == U('*') && source < sentinel
283 && *source == U('/')) {
284 *target++ = *source++;
285 reset = NULL;
286 break;
287 }
288 }
289 if (!reset)
290 continue;
291
292 target -= source - reset;
293 source = reset;
294 if (pcreset == reset) {
295 pctoken = NULL;
296 pcreset = NULL;
297 }
298
299 }
300 /* strip regular comment */
301 else {
302 while (source < sentinel) {
303 c = *source++;
304 if (c == U('*') && source < sentinel
305 && *source == U('/')) {
306 ++source;
307 reset = NULL;
308 break;
309 }
310 }
311 if (!reset)
312 continue;
313 source = reset;
314 *target++ = U('/');
315 }
316 goto cont;
317 case U('/'):
318 ++source;
319 while (source < sentinel) {
320 c = *source++;
321 switch (c) {
322 case U('\n'):
323 break;
324 case U('\r'):
325 if (source < sentinel
326 && *source == U('\n'))
327 ++source;
328 break;
329 default:
330 continue;
331 }
332 break;
333 }
334 quote = U('\n');
335 continue;
336 }
337 }
338 }
339 --source;
340 break;
341 }
342
343 if ((tstart < (pctoken ? pctoken : target) && source < sentinel)
344 && ((quote == U('\n')
345 && ((RJSMIN_IS_ID_LITERAL_CLOSE(*((pctoken ?
346 pctoken : target) - 1))
347 && RJSMIN_IS_ID_LITERAL_OPEN(*source))
348 || (post_regex
349 && RJSMIN_IS_POST_REGEX_OFF(*source)
350 && !(post_regex = 0))))
351 ||
352 (quote == U(' ') && !pctoken
353 && ((RJSMIN_IS_ID_LITERAL(*(target - 1))
354 && RJSMIN_IS_ID_LITERAL(*source))
355 || (source < sentinel
356 && ((*(target - 1) == U('+')
357 && *source == U('+'))
358 || (*(target - 1) == U('-')
359 && *source == U('-')))))))) {
360 *target++ = quote;
361 }
362
363 pcreset = NULL;
364 spaced = quote;
365 }
366 cont:
367 continue;
368 }
369 return (Py_ssize_t)(target - tstart);
370 }
371
372
/*
 * Docstring of the jsmin() function (reStructuredText).
 * NOTE(review): leading whitespace inside the literal is part of the string;
 * the lines below carry none -- verify the intended reST indentation.
 */
PyDoc_STRVAR(rjsmin_jsmin__doc__,
"jsmin(script, keep_bang_comments=False)\n\
\n\
Minify javascript based on `jsmin.c by Douglas Crockford`_\\.\n\
\n\
Instead of parsing the stream char by char, it uses a regular\n\
expression approach which minifies the whole script with one big\n\
substitution regex.\n\
\n\
.. _jsmin.c by Douglas Crockford:\n\
http://www.crockford.com/javascript/jsmin.c\n\
\n\
:Note: This is a hand crafted C implementation built on the regex\n\
semantics.\n\
\n\
:Parameters:\n\
`script` : ``str``\n\
Script to minify\n\
\n\
`keep_bang_comments` : ``bool``\n\
Keep comments starting with an exclamation mark? (``/*!...*/``)\n\
\n\
:Return: Minified script\n\
:Rtype: ``str``");
397
398 static PyObject *
rjsmin_jsmin(PyObject * self,PyObject * args,PyObject * kwds)399 rjsmin_jsmin(PyObject *self, PyObject *args, PyObject *kwds)
400 {
401 PyObject *script, *keep_bang_comments_ = NULL, *result;
402 static char *kwlist[] = {"script", "keep_bang_comments", NULL};
403 Py_ssize_t slength, length;
404 int keep_bang_comments;
405 #ifdef EXT2
406 int uni;
407 #define UOBJ "O"
408 #endif
409 #ifdef EXT3
410 #define UOBJ "U"
411 #endif
412
413 if (!PyArg_ParseTupleAndKeywords(args, kwds, UOBJ "|O", kwlist,
414 &script, &keep_bang_comments_))
415 return NULL;
416
417 if (!keep_bang_comments_)
418 keep_bang_comments = 0;
419 else {
420 keep_bang_comments = PyObject_IsTrue(keep_bang_comments_);
421 if (keep_bang_comments == -1)
422 return NULL;
423 }
424
425 #ifdef EXT2
426 if (PyUnicode_Check(script)) {
427 if (!(script = PyUnicode_AsUTF8String(script)))
428 return NULL;
429 uni = 1;
430 }
431 else {
432 if (!(script = PyObject_Str(script)))
433 return NULL;
434 uni = 0;
435 }
436 #endif
437
438 #ifdef EXT3
439 Py_INCREF(script);
440 #define PyString_GET_SIZE PyUnicode_GET_SIZE
441 #define PyString_AS_STRING PyUnicode_AS_UNICODE
442 #define _PyString_Resize PyUnicode_Resize
443 #define PyString_FromStringAndSize PyUnicode_FromUnicode
444 #endif
445
446 slength = PyString_GET_SIZE(script);
447 if (!(result = PyString_FromStringAndSize(NULL, slength))) {
448 Py_DECREF(script);
449 return NULL;
450 }
451 Py_BEGIN_ALLOW_THREADS
452 length = rjsmin((rchar *)PyString_AS_STRING(script),
453 (rchar *)PyString_AS_STRING(result),
454 slength, keep_bang_comments);
455 Py_END_ALLOW_THREADS
456
457 Py_DECREF(script);
458 if (length < 0) {
459 Py_DECREF(result);
460 return NULL;
461 }
462 if (length != slength && _PyString_Resize(&result, length) == -1)
463 return NULL;
464
465 #ifdef EXT2
466 if (uni) {
467 script = PyUnicode_DecodeUTF8(PyString_AS_STRING(result),
468 PyString_GET_SIZE(result), "strict");
469 Py_DECREF(result);
470 if (!script)
471 return NULL;
472 result = script;
473 }
474 #endif
475 return result;
476 }
477
478 /* ------------------------ BEGIN MODULE DEFINITION ------------------------ */
479
/*
 * Method table of the module: exposes rjsmin_jsmin() as "jsmin", accepting
 * positional and keyword arguments.
 * NOTE(review): EXT_METHODS is a macro from cext.h -- presumably it expands
 * to the declaration of the PyMethodDef array; confirm there.
 */
EXT_METHODS = {
    {"jsmin",
     (PyCFunction)rjsmin_jsmin, METH_VARARGS | METH_KEYWORDS,
     rjsmin_jsmin__doc__},

    {NULL}  /* Sentinel */
};
487
/* Module docstring (reStructuredText); passed to EXT_DEFINE below */
PyDoc_STRVAR(EXT_DOCS_VAR,
"C implementation of rjsmin\n\
==========================\n\
\n\
C implementation of rjsmin.");
493
494
/*
 * Tie module name, method table and module docstring together.
 * NOTE(review): EXT_DEFINE comes from cext.h -- presumably it defines the
 * EXT_DEFINE_VAR object handed to EXT_CREATE in the init function; confirm.
 */
EXT_DEFINE(EXT_MODULE_NAME, EXT_METHODS_VAR, EXT_DOCS_VAR);
496
/*
 * Module init function: creates the module object and adds the
 * `__author__` and `__docformat__` attributes to it.
 * NOTE(review): all EXT_* names are macros from cext.h; EXT_INIT_ERROR and
 * EXT_INIT_RETURN presumably contain the actual return statements -- confirm.
 */
EXT_INIT_FUNC {
    PyObject *m;

    /* Create the module and populate stuff */
    if (!(m = EXT_CREATE(&EXT_DEFINE_VAR)))
        EXT_INIT_ERROR(NULL);

    /* The author name is passed as latin-1 encoded bytes (\xe9 = e acute) */
    EXT_ADD_UNICODE(m, "__author__", "Andr\xe9 Malo", "latin-1");
    EXT_ADD_STRING(m, "__docformat__", "restructuredtext en");

    EXT_INIT_RETURN(m);
}
509
510 /* ------------------------- END MODULE DEFINITION ------------------------- */
511