1 // Copyright (c) 2012 The WebM project authors. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the LICENSE file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8
9 #include "webvttparser.h"
10
11 #include <ctype.h>
12
13 #include <climits>
14 #include <cstddef>
15
16 namespace libwebvtt {
17
// Character constants recognized by the parser.  The NOLINT marker is
// here because clang-format would otherwise collapse this enum onto a
// single, hard-to-read line.
enum {
  kNUL = '\0',
  kSPACE = '\x20',
  kTAB = '\t',
  kLF = '\n',
  kCR = '\r'
};  // NOLINT
27
// Out-of-line definition of the Reader destructor; the body is empty.
Reader::~Reader() {}
29
// Out-of-line definition of the LineReader destructor; the body is empty.
LineReader::~LineReader() {}
31
GetLine(std::string * line_ptr)32 int LineReader::GetLine(std::string* line_ptr) {
33 if (line_ptr == NULL)
34 return -1;
35
36 std::string& ln = *line_ptr;
37 ln.clear();
38
39 // Consume characters from the stream, until we
40 // reach end-of-line (or end-of-stream).
41
42 // The WebVTT spec states that lines may be
43 // terminated in any of these three ways:
44 // LF
45 // CR
46 // CR LF
47
48 // We interrogate each character as we read it from the stream.
49 // If we detect an end-of-line character, we consume the full
50 // end-of-line indication, and we're done; otherwise, accumulate
51 // the character and repeat.
52
53 for (;;) {
54 char c;
55 const int e = GetChar(&c);
56
57 if (e < 0) // error
58 return e;
59
60 if (e > 0) // EOF
61 return (ln.empty()) ? 1 : 0;
62
63 // We have a character, so we must first determine
64 // whether we have reached end-of-line.
65
66 if (c == kLF)
67 return 0; // handle the easy end-of-line case immediately
68
69 if (c == kCR)
70 break; // handle the hard end-of-line case outside of loop
71
72 if (c == '\xFE' || c == '\xFF') // not UTF-8
73 return -1;
74
75 // To defend against pathological or malicious streams, we
76 // cap the line length at some arbitrarily-large value:
77 enum { kMaxLineLength = 10000 }; // arbitrary
78
79 if (ln.length() >= kMaxLineLength)
80 return -1;
81
82 // We don't have an end-of-line character, so accumulate
83 // the character in our line buffer.
84 ln.push_back(c);
85 }
86
87 // We detected a CR. We must interrogate the next character
88 // in the stream, to determine whether we have a LF (which
89 // would make it part of this same line).
90
91 char c;
92 const int e = GetChar(&c);
93
94 if (e < 0) // error
95 return e;
96
97 if (e > 0) // EOF
98 return 0;
99
100 // If next character in the stream is not a LF, return it
101 // to the stream (because it's part of the next line).
102 if (c != kLF)
103 UngetChar(c);
104
105 return 0;
106 }
107
// Stores the reader |r| that characters are pulled from, and sets the
// one-character pushback slot to -1 (GetChar() treats a negative value
// as "nothing pushed back").
Parser::Parser(Reader* r) : reader_(r), unget_(-1) {}
109
// Empty destructor; nothing is released here.
Parser::~Parser() {}
111
Init()112 int Parser::Init() {
113 int e = ParseBOM();
114
115 if (e < 0) // error
116 return e;
117
118 if (e > 0) // EOF
119 return -1;
120
121 // Parse "WEBVTT". We read from the stream one character at-a-time, in
122 // order to defend against non-WebVTT streams (e.g. binary files) that don't
123 // happen to comprise lines of text demarcated with line terminators.
124
125 const char kId[] = "WEBVTT";
126
127 for (const char* p = kId; *p; ++p) {
128 char c;
129 e = GetChar(&c);
130
131 if (e < 0) // error
132 return e;
133
134 if (e > 0) // EOF
135 return -1;
136
137 if (c != *p)
138 return -1;
139 }
140
141 std::string line;
142
143 e = GetLine(&line);
144
145 if (e < 0) // error
146 return e;
147
148 if (e > 0) // EOF
149 return 0; // weird but valid
150
151 if (!line.empty()) {
152 // Parse optional characters that follow "WEBVTT"
153
154 const char c = line[0];
155
156 if (c != kSPACE && c != kTAB)
157 return -1;
158 }
159
160 // The WebVTT spec requires that the "WEBVTT" line
161 // be followed by an empty line (to separate it from
162 // first cue).
163
164 e = GetLine(&line);
165
166 if (e < 0) // error
167 return e;
168
169 if (e > 0) // EOF
170 return 0; // weird but we allow it
171
172 if (!line.empty())
173 return -1;
174
175 return 0; // success
176 }
177
Parse(Cue * cue)178 int Parser::Parse(Cue* cue) {
179 if (cue == NULL)
180 return -1;
181
182 // Parse first non-blank line
183
184 std::string line;
185 int e;
186
187 for (;;) {
188 e = GetLine(&line);
189
190 if (e) // EOF is OK here
191 return e;
192
193 if (!line.empty())
194 break;
195 }
196
197 // A WebVTT cue comprises an optional cue identifier line followed
198 // by a (non-optional) timings line. You determine whether you have
199 // a timings line by scanning for the arrow token, the lexeme of which
200 // may not appear in the cue identifier line.
201
202 const char kArrow[] = "-->";
203 std::string::size_type arrow_pos = line.find(kArrow);
204
205 if (arrow_pos != std::string::npos) {
206 // We found a timings line, which implies that we don't have a cue
207 // identifier.
208
209 cue->identifier.clear();
210 } else {
211 // We did not find a timings line, so we assume that we have a cue
212 // identifier line, and then try again to find the cue timings on
213 // the next line.
214
215 cue->identifier.swap(line);
216
217 e = GetLine(&line);
218
219 if (e < 0) // error
220 return e;
221
222 if (e > 0) // EOF
223 return -1;
224
225 arrow_pos = line.find(kArrow);
226
227 if (arrow_pos == std::string::npos) // not a timings line
228 return -1;
229 }
230
231 e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time,
232 &cue->settings);
233
234 if (e) // error
235 return e;
236
237 // The cue payload comprises all the non-empty
238 // lines that follow the timings line.
239
240 Cue::payload_t& p = cue->payload;
241 p.clear();
242
243 for (;;) {
244 e = GetLine(&line);
245
246 if (e < 0) // error
247 return e;
248
249 if (line.empty())
250 break;
251
252 p.push_back(line);
253 }
254
255 if (p.empty())
256 return -1;
257
258 return 0; // success
259 }
260
GetChar(char * c)261 int Parser::GetChar(char* c) {
262 if (unget_ >= 0) {
263 *c = static_cast<char>(unget_);
264 unget_ = -1;
265 return 0;
266 }
267
268 return reader_->GetChar(c);
269 }
270
// Pushes |c| back so that the next GetChar() call returns it.  The value
// is stored via unsigned char so it is never negative; GetChar() uses a
// negative unget_ to mean "nothing pushed back".
void Parser::UngetChar(char c) { unget_ = static_cast<unsigned char>(c); }
272
ParseBOM()273 int Parser::ParseBOM() {
274 // Explanation of UTF-8 BOM:
275 // http://en.wikipedia.org/wiki/Byte_order_mark
276
277 static const char BOM[] = "\xEF\xBB\xBF"; // UTF-8 BOM
278
279 for (int i = 0; i < 3; ++i) {
280 char c;
281 int e = GetChar(&c);
282
283 if (e < 0) // error
284 return e;
285
286 if (e > 0) // EOF
287 return 1;
288
289 if (c != BOM[i]) {
290 if (i == 0) { // we don't have a BOM
291 UngetChar(c);
292 return 0; // success
293 }
294
295 // We started a BOM, so we must finish the BOM.
296 return -1; // error
297 }
298 }
299
300 return 0; // success
301 }
302
ParseTimingsLine(std::string * line_ptr,std::string::size_type arrow_pos,Time * start_time,Time * stop_time,Cue::settings_t * settings)303 int Parser::ParseTimingsLine(std::string* line_ptr,
304 std::string::size_type arrow_pos, Time* start_time,
305 Time* stop_time, Cue::settings_t* settings) {
306 if (line_ptr == NULL)
307 return -1;
308
309 std::string& line = *line_ptr;
310
311 if (arrow_pos == std::string::npos || arrow_pos >= line.length())
312 return -1;
313
314 // Place a NUL character at the start of the arrow token, in
315 // order to demarcate the start time from remainder of line.
316 line[arrow_pos] = kNUL;
317 std::string::size_type idx = 0;
318
319 int e = ParseTime(line, &idx, start_time);
320 if (e) // error
321 return e;
322
323 // Detect any junk that follows the start time,
324 // but precedes the arrow symbol.
325
326 while (char c = line[idx]) {
327 if (c != kSPACE && c != kTAB)
328 return -1;
329 ++idx;
330 }
331
332 // Place a NUL character at the end of the line,
333 // so the scanner has a place to stop, and begin
334 // the scan just beyond the arrow token.
335
336 line.push_back(kNUL);
337 idx = arrow_pos + 3;
338
339 e = ParseTime(line, &idx, stop_time);
340 if (e) // error
341 return e;
342
343 e = ParseSettings(line, idx, settings);
344 if (e) // error
345 return e;
346
347 return 0; // success
348 }
349
ParseTime(const std::string & line,std::string::size_type * idx_ptr,Time * time)350 int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr,
351 Time* time) {
352 if (idx_ptr == NULL)
353 return -1;
354
355 std::string::size_type& idx = *idx_ptr;
356
357 if (idx == std::string::npos || idx >= line.length())
358 return -1;
359
360 if (time == NULL)
361 return -1;
362
363 // Consume any whitespace that precedes the timestamp.
364
365 while (char c = line[idx]) {
366 if (c != kSPACE && c != kTAB)
367 break;
368 ++idx;
369 }
370
371 // WebVTT timestamp syntax comes in three flavors:
372 // SS[.sss]
373 // MM:SS[.sss]
374 // HH:MM:SS[.sss]
375
376 // Parse a generic number value. We don't know which component
377 // of the time we have yet, until we do more parsing.
378
379 int val = ParseNumber(line, &idx);
380
381 if (val < 0) // error
382 return val;
383
384 Time& t = *time;
385
386 // The presence of a colon character indicates that we have
387 // an [HH:]MM:SS style syntax.
388
389 if (line[idx] == ':') {
390 // We have either HH:MM:SS or MM:SS
391
392 // The value we just parsed is either the hours or minutes.
393 // It must be followed by another number value (that is
394 // either minutes or seconds).
395
396 const int first_val = val;
397
398 ++idx; // consume colon
399
400 // Parse second value
401
402 val = ParseNumber(line, &idx);
403
404 if (val < 0)
405 return val;
406
407 if (val >= 60) // either MM or SS
408 return -1;
409
410 if (line[idx] == ':') {
411 // We have HH:MM:SS
412
413 t.hours = first_val;
414 t.minutes = val; // vetted above
415
416 ++idx; // consume MM:SS colon
417
418 // We have parsed the hours and minutes.
419 // We must now parse the seconds.
420
421 val = ParseNumber(line, &idx);
422
423 if (val < 0)
424 return val;
425
426 if (val >= 60) // SS part of HH:MM:SS
427 return -1;
428
429 t.seconds = val;
430 } else {
431 // We have MM:SS
432
433 // The implication here is that the hour value was omitted
434 // from the timestamp (because it was 0).
435
436 if (first_val >= 60) // minutes
437 return -1;
438
439 t.hours = 0;
440 t.minutes = first_val;
441 t.seconds = val; // vetted above
442 }
443 } else {
444 // We have SS (only)
445
446 // The time is expressed as total number of seconds,
447 // so the seconds value has no upper bound.
448
449 t.seconds = val;
450
451 // Convert SS to HH:MM:SS
452
453 t.minutes = t.seconds / 60;
454 t.seconds -= t.minutes * 60;
455
456 t.hours = t.minutes / 60;
457 t.minutes -= t.hours * 60;
458 }
459
460 // We have parsed the hours, minutes, and seconds.
461 // We must now parse the milliseconds.
462
463 char c = line[idx];
464
465 // TODO(matthewjheaney): one option here is to slightly relax the
466 // syntax rules for WebVTT timestamps, to permit the comma character
467 // to also be used as the seconds/milliseconds separator. This
468 // would handle streams that use localization conventions for
469 // countries in Western Europe. For now we obey the rules specified
470 // in the WebVTT spec (allow "full stop" only).
471
472 const bool have_milliseconds = (c == '.');
473
474 if (!have_milliseconds) {
475 t.milliseconds = 0;
476 } else {
477 ++idx; // consume FULL STOP
478
479 val = ParseNumber(line, &idx);
480
481 if (val < 0)
482 return val;
483
484 if (val >= 1000)
485 return -1;
486
487 if (val < 10)
488 t.milliseconds = val * 100;
489 else if (val < 100)
490 t.milliseconds = val * 10;
491 else
492 t.milliseconds = val;
493 }
494
495 // We have parsed the time proper. We must check for any
496 // junk that immediately follows the time specifier.
497
498 c = line[idx];
499
500 if (c != kNUL && c != kSPACE && c != kTAB)
501 return -1;
502
503 return 0; // success
504 }
505
ParseSettings(const std::string & line,std::string::size_type idx,Cue::settings_t * settings)506 int Parser::ParseSettings(const std::string& line, std::string::size_type idx,
507 Cue::settings_t* settings) {
508 settings->clear();
509
510 if (idx == std::string::npos || idx >= line.length())
511 return -1;
512
513 for (;;) {
514 // We must parse a line comprising a sequence of 0 or more
515 // NAME:VALUE pairs, separated by whitespace. The line iself is
516 // terminated with a NUL char (indicating end-of-line).
517
518 for (;;) {
519 const char c = line[idx];
520
521 if (c == kNUL) // end-of-line
522 return 0; // success
523
524 if (c != kSPACE && c != kTAB)
525 break;
526
527 ++idx; // consume whitespace
528 }
529
530 // We have consumed the whitespace, and have not yet reached
531 // end-of-line, so there is something on the line for us to parse.
532
533 settings->push_back(Setting());
534 Setting& s = settings->back();
535
536 // Parse the NAME part of the settings pair.
537
538 for (;;) {
539 const char c = line[idx];
540
541 if (c == ':') // we have reached end of NAME part
542 break;
543
544 if (c == kNUL || c == kSPACE || c == kTAB)
545 return -1;
546
547 s.name.push_back(c);
548
549 ++idx;
550 }
551
552 if (s.name.empty())
553 return -1;
554
555 ++idx; // consume colon
556
557 // Parse the VALUE part of the settings pair.
558
559 for (;;) {
560 const char c = line[idx];
561
562 if (c == kNUL || c == kSPACE || c == kTAB)
563 break;
564
565 if (c == ':') // suspicious when part of VALUE
566 return -1; // TODO(matthewjheaney): verify this behavior
567
568 s.value.push_back(c);
569
570 ++idx;
571 }
572
573 if (s.value.empty())
574 return -1;
575 }
576 }
577
ParseNumber(const std::string & line,std::string::size_type * idx_ptr)578 int Parser::ParseNumber(const std::string& line,
579 std::string::size_type* idx_ptr) {
580 if (idx_ptr == NULL)
581 return -1;
582
583 std::string::size_type& idx = *idx_ptr;
584
585 if (idx == std::string::npos || idx >= line.length())
586 return -1;
587
588 if (!isdigit(line[idx]))
589 return -1;
590
591 int result = 0;
592
593 while (isdigit(line[idx])) {
594 const char c = line[idx];
595 const int i = c - '0';
596
597 if (result > INT_MAX / 10)
598 return -1;
599
600 result *= 10;
601
602 if (result > INT_MAX - i)
603 return -1;
604
605 result += i;
606
607 ++idx;
608 }
609
610 return result;
611 }
612
operator ==(const Time & rhs) const613 bool Time::operator==(const Time& rhs) const {
614 if (hours != rhs.hours)
615 return false;
616
617 if (minutes != rhs.minutes)
618 return false;
619
620 if (seconds != rhs.seconds)
621 return false;
622
623 return (milliseconds == rhs.milliseconds);
624 }
625
operator <(const Time & rhs) const626 bool Time::operator<(const Time& rhs) const {
627 if (hours < rhs.hours)
628 return true;
629
630 if (hours > rhs.hours)
631 return false;
632
633 if (minutes < rhs.minutes)
634 return true;
635
636 if (minutes > rhs.minutes)
637 return false;
638
639 if (seconds < rhs.seconds)
640 return true;
641
642 if (seconds > rhs.seconds)
643 return false;
644
645 return (milliseconds < rhs.milliseconds);
646 }
647
operator >(const Time & rhs) const648 bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); }
649
operator <=(const Time & rhs) const650 bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); }
651
operator >=(const Time & rhs) const652 bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); }
653
presentation() const654 presentation_t Time::presentation() const {
655 const presentation_t h = 1000LL * 3600LL * presentation_t(hours);
656 const presentation_t m = 1000LL * 60LL * presentation_t(minutes);
657 const presentation_t s = 1000LL * presentation_t(seconds);
658 const presentation_t result = h + m + s + milliseconds;
659 return result;
660 }
661
presentation(presentation_t d)662 Time& Time::presentation(presentation_t d) {
663 if (d < 0) { // error
664 hours = 0;
665 minutes = 0;
666 seconds = 0;
667 milliseconds = 0;
668
669 return *this;
670 }
671
672 seconds = static_cast<int>(d / 1000);
673 milliseconds = static_cast<int>(d - 1000 * seconds);
674
675 minutes = seconds / 60;
676 seconds -= 60 * minutes;
677
678 hours = minutes / 60;
679 minutes -= 60 * hours;
680
681 return *this;
682 }
683
operator +=(presentation_t rhs)684 Time& Time::operator+=(presentation_t rhs) {
685 const presentation_t d = this->presentation();
686 const presentation_t dd = d + rhs;
687 this->presentation(dd);
688 return *this;
689 }
690
operator +(presentation_t d) const691 Time Time::operator+(presentation_t d) const {
692 Time t(*this);
693 t += d;
694 return t;
695 }
696
operator -=(presentation_t d)697 Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); }
698
operator -(const Time & t) const699 presentation_t Time::operator-(const Time& t) const {
700 const presentation_t rhs = t.presentation();
701 const presentation_t lhs = this->presentation();
702 const presentation_t result = lhs - rhs;
703 return result;
704 }
705
706 } // namespace libwebvtt
707