1 /*
2 * nghttp2 - HTTP/2 C Library
3 *
4 * Copyright (c) 2012 Tatsuhiro Tsujikawa
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
#include "HtmlParser.h"

#include <libxml/uri.h>
#include <libxml/xmlmemory.h>

#include "util.h"
30
31 namespace nghttp2 {
32
ParserData(const std::string & base_uri)33 ParserData::ParserData(const std::string &base_uri)
34 : base_uri(base_uri), inside_head(0) {}
35
HtmlParser(const std::string & base_uri)36 HtmlParser::HtmlParser(const std::string &base_uri)
37 : base_uri_(base_uri), parser_ctx_(nullptr), parser_data_(base_uri) {}
38
~HtmlParser()39 HtmlParser::~HtmlParser() { htmlFreeParserCtxt(parser_ctx_); }
40
41 namespace {
// Looks up the attribute called |name| (compared case-insensitively)
// in |attrs| and returns its value.
//
// |attrs| is the array handed to the SAX startElement callback:
// alternating name and value pointers, terminated by a null name.  It
// may itself be nullptr when the element has no attributes.
//
// Returns an empty StringRef when |attrs| is nullptr, when no
// attribute matches, or when the first matching attribute was written
// without a value.
StringRef get_attr(const xmlChar **attrs, const StringRef &name) {
  if (attrs == nullptr) {
    return StringRef{};
  }
  for (; *attrs; attrs += 2) {
    auto key = reinterpret_cast<const char *>(attrs[0]);
    if (!util::strieq(StringRef{key, strlen(key)}, name)) {
      continue;
    }
    // The HTML parser reports a valueless attribute (e.g. <img src>)
    // with a null value pointer; calling strlen() on it would be
    // undefined behavior, so treat it like an empty value.
    auto value = reinterpret_cast<const char *>(attrs[1]);
    if (value == nullptr) {
      return StringRef{};
    }
    return StringRef{value, strlen(value)};
  }
  return StringRef{};
}
56 } // namespace
57
58 namespace {
59 ResourceType
get_resource_type_for_preload_as(const StringRef & attribute_value)60 get_resource_type_for_preload_as(const StringRef &attribute_value) {
61 if (util::strieq_l("image", attribute_value)) {
62 return REQ_IMG;
63 } else if (util::strieq_l("style", attribute_value)) {
64 return REQ_CSS;
65 } else if (util::strieq_l("script", attribute_value)) {
66 return REQ_UNBLOCK_JS;
67 } else {
68 return REQ_OTHERS;
69 }
70 }
71 } // namespace
72
73 namespace {
add_link(ParserData * parser_data,const StringRef & uri,ResourceType res_type)74 void add_link(ParserData *parser_data, const StringRef &uri,
75 ResourceType res_type) {
76 auto u = xmlBuildURI(
77 reinterpret_cast<const xmlChar *>(uri.c_str()),
78 reinterpret_cast<const xmlChar *>(parser_data->base_uri.c_str()));
79 if (u) {
80 parser_data->links.push_back(
81 std::make_pair(reinterpret_cast<char *>(u), res_type));
82 free(u);
83 }
84 }
85 } // namespace
86
87 namespace {
start_element_func(void * user_data,const xmlChar * src_name,const xmlChar ** attrs)88 void start_element_func(void *user_data, const xmlChar *src_name,
89 const xmlChar **attrs) {
90 auto parser_data = static_cast<ParserData *>(user_data);
91 auto name =
92 StringRef{src_name, strlen(reinterpret_cast<const char *>(src_name))};
93 if (util::strieq_l("head", name)) {
94 ++parser_data->inside_head;
95 }
96 if (util::strieq_l("link", name)) {
97 auto rel_attr = get_attr(attrs, StringRef::from_lit("rel"));
98 auto href_attr = get_attr(attrs, StringRef::from_lit("href"));
99 if (rel_attr.empty() || href_attr.empty()) {
100 return;
101 }
102 if (util::strieq_l("shortcut icon", rel_attr)) {
103 add_link(parser_data, href_attr, REQ_OTHERS);
104 } else if (util::strieq_l("stylesheet", rel_attr)) {
105 add_link(parser_data, href_attr, REQ_CSS);
106 } else if (util::strieq_l("preload", rel_attr)) {
107 auto as_attr = get_attr(attrs, StringRef::from_lit("as"));
108 if (as_attr.empty()) {
109 return;
110 }
111 add_link(parser_data, href_attr,
112 get_resource_type_for_preload_as(as_attr));
113 }
114 } else if (util::strieq_l("img", name)) {
115 auto src_attr = get_attr(attrs, StringRef::from_lit("src"));
116 if (src_attr.empty()) {
117 return;
118 }
119 add_link(parser_data, src_attr, REQ_IMG);
120 } else if (util::strieq_l("script", name)) {
121 auto src_attr = get_attr(attrs, StringRef::from_lit("src"));
122 if (src_attr.empty()) {
123 return;
124 }
125 if (parser_data->inside_head) {
126 add_link(parser_data, src_attr, REQ_JS);
127 } else {
128 add_link(parser_data, src_attr, REQ_UNBLOCK_JS);
129 }
130 }
131 }
132 } // namespace
133
134 namespace {
end_element_func(void * user_data,const xmlChar * name)135 void end_element_func(void *user_data, const xmlChar *name) {
136 auto parser_data = static_cast<ParserData *>(user_data);
137 if (util::strieq_l(
138 "head",
139 StringRef{name, strlen(reinterpret_cast<const char *>(name))})) {
140 --parser_data->inside_head;
141 }
142 }
143 } // namespace
144
145 namespace {
146 xmlSAXHandler saxHandler = {
147 nullptr, // internalSubsetSAXFunc
148 nullptr, // isStandaloneSAXFunc
149 nullptr, // hasInternalSubsetSAXFunc
150 nullptr, // hasExternalSubsetSAXFunc
151 nullptr, // resolveEntitySAXFunc
152 nullptr, // getEntitySAXFunc
153 nullptr, // entityDeclSAXFunc
154 nullptr, // notationDeclSAXFunc
155 nullptr, // attributeDeclSAXFunc
156 nullptr, // elementDeclSAXFunc
157 nullptr, // unparsedEntityDeclSAXFunc
158 nullptr, // setDocumentLocatorSAXFunc
159 nullptr, // startDocumentSAXFunc
160 nullptr, // endDocumentSAXFunc
161 &start_element_func, // startElementSAXFunc
162 &end_element_func, // endElementSAXFunc
163 nullptr, // referenceSAXFunc
164 nullptr, // charactersSAXFunc
165 nullptr, // ignorableWhitespaceSAXFunc
166 nullptr, // processingInstructionSAXFunc
167 nullptr, // commentSAXFunc
168 nullptr, // warningSAXFunc
169 nullptr, // errorSAXFunc
170 nullptr, // fatalErrorSAXFunc
171 nullptr, // getParameterEntitySAXFunc
172 nullptr, // cdataBlockSAXFunc
173 nullptr, // externalSubsetSAXFunc
174 0, // unsigned int initialized
175 nullptr, // void * _private
176 nullptr, // startElementNsSAX2Func
177 nullptr, // endElementNsSAX2Func
178 nullptr, // xmlStructuredErrorFunc
179 };
180 } // namespace
181
parse_chunk(const char * chunk,size_t size,int fin)182 int HtmlParser::parse_chunk(const char *chunk, size_t size, int fin) {
183 if (!parser_ctx_) {
184 parser_ctx_ =
185 htmlCreatePushParserCtxt(&saxHandler, &parser_data_, chunk, size,
186 base_uri_.c_str(), XML_CHAR_ENCODING_NONE);
187 if (!parser_ctx_) {
188 return -1;
189 } else {
190 if (fin) {
191 return parse_chunk_internal(nullptr, 0, fin);
192 } else {
193 return 0;
194 }
195 }
196 } else {
197 return parse_chunk_internal(chunk, size, fin);
198 }
199 }
200
parse_chunk_internal(const char * chunk,size_t size,int fin)201 int HtmlParser::parse_chunk_internal(const char *chunk, size_t size, int fin) {
202 int rv = htmlParseChunk(parser_ctx_, chunk, size, fin);
203 if (rv == 0) {
204 return 0;
205 } else {
206 return -1;
207 }
208 }
209
210 const std::vector<std::pair<std::string, ResourceType>> &
get_links() const211 HtmlParser::get_links() const {
212 return parser_data_.links;
213 }
214
clear_links()215 void HtmlParser::clear_links() { parser_data_.links.clear(); }
216
217 } // namespace nghttp2
218