maug
Quick and dirty C mini-augmentation library.
mhtml.h
1
2#ifndef MHTML_H
3#define MHTML_H
4
/* Compile-time tunables. Each may be overridden by defining it before this
 * header is included.
 */

/* Initial tag count. Not referenced elsewhere in this header. */
#if !defined( MHTML_PARSER_TAGS_INIT_SZ )
#  define MHTML_PARSER_TAGS_INIT_SZ 10
#endif

/* Requesting the HTML implementation also requests the CSS implementation. */
#if defined( MHTML_C )
#  define MCSS_C
#endif

/* Capacity (excluding the terminator) of the line buffer used by
 * mhtml_dump_tree().
 */
#if !defined( MHTML_DUMP_LINE_SZ )
#  define MHTML_DUMP_LINE_SZ 255
#endif

/* Capacity (excluding the terminator) of the IMG tag's src buffer. */
#if !defined( MHTML_SRC_HREF_SZ_MAX )
#  define MHTML_SRC_HREF_SZ_MAX 128
#endif

/* Level handed to debug_printf() for this module's trace messages. */
#if !defined( MHTML_TRACE_LVL )
#  define MHTML_TRACE_LVL 0
#endif

/* Tag flag: the next pushed text tag holds the contents of a STYLE element. */
#define MHTML_TAG_FLAG_STYLE 0x02

/* Value stored in the INPUT tag's input_type field for buttons. */
#define MHTML_INPUT_TYPE_BUTTON 0x01
29
30#include <mparser.h>
31#include <mcss.h>
32
/* X-macro list of the attributes the parser recognizes. The supplied macro is
 * invoked once per row as row( NAME, index ); the same rows generate both the
 * attribute name strings and the MHTML_ATTRIB_KEY_* constants.
 */
#define MHTML_ATTRIB_TABLE( row ) \
   row( NONE, 0 ) \
   row( STYLE, 1 ) \
   row( CLASS, 2 ) \
   row( ID, 3 ) \
   row( NAME, 4 ) \
   row( SRC, 5 ) \
   row( TYPE, 6 ) \
   row( VALUE, 7 )
42
/* X-macro list of the tag types the parser recognizes. The supplied macro is
 * invoked once per row as row( id, NAME, fields, disp ), where:
 *
 * - id is the numeric tag type, also the index into gc_mhtml_tag_names.
 * - NAME is the tag name as it appears (upper-cased) in markup.
 * - fields are the struct members specific to that tag type.
 * - disp is one of NONE/BLOCK/INLINE (presumably the default display mode;
 *   it is not consumed anywhere in this header).
 */
#define MHTML_TAG_TABLE( row ) \
   row(  0, NONE,  void* none;, NONE ) \
   row(  1, BODY,  void* none;, BLOCK ) \
   row(  2, DIV,   void* none;, BLOCK ) \
   row(  3, HEAD,  void* none;, NONE ) \
   row(  4, HTML,  void* none;, BLOCK ) \
   row(  5, TEXT,  mdata_strpool_idx_t content_idx; size_t content_sz;, INLINE ) \
   row(  6, TITLE, mdata_strpool_idx_t content_idx; size_t content_sz;, NONE ) \
   row(  7, SPAN,  void* none;, INLINE ) \
   row(  8, BR,    void* none;, BLOCK ) \
   row(  9, STYLE, void* none;, NONE ) \
   row( 10, IMG,   char src[MHTML_SRC_HREF_SZ_MAX + 1]; size_t src_sz;, BLOCK ) \
   row( 11, INPUT, uint8_t input_type; char name[MCSS_ID_SZ_MAX + 1]; size_t name_sz; char value[MCSS_ID_SZ_MAX + 1]; size_t value_sz;, INLINE )
56
/* X-macro list of parser states. The supplied macro is invoked once per row
 * as row( STATE_NAME, value ).
 */
#define MHTML_PARSER_PSTATE_TABLE( row ) \
   row( MHTML_PSTATE_NONE, 0 ) \
   row( MHTML_PSTATE_ELEMENT, 1 ) \
   row( MHTML_PSTATE_ATTRIB_KEY, 2 ) \
   row( MHTML_PSTATE_ATTRIB_VAL, 3 ) \
   row( MHTML_PSTATE_END_ELEMENT, 4 ) \
   row( MHTML_PSTATE_STRING, 5 ) \
   row( MHTML_PSTATE_STYLE, 6 )
65
66/* TODO: Function names should be verb_noun! */
67
#if 0
/* Disabled legacy accessors. They index parser->tags as a plain array, while
 * the parser keeps its tags in a struct MDATA_VECTOR, so they must stay
 * disabled unless rewritten around mdata_vector_get(). The stray closing
 * brackets they used to carry have been removed so that re-enabling them
 * fails for that reason only and not on a syntax error.
 */
#define mhtml_tag( parser, idx ) (&((parser)->tags[idx]))

#define mhtml_tag_parent( parser, idx ) \
   (0 <= (parser)->tags[idx].parent ? \
      (&((parser)->tags[(parser)->tags[idx].parent])) : NULL)

#define mhtml_tag_child( parser, idx ) \
   (0 <= (parser)->tags[idx].first_child ? \
      (&((parser)->tags[(parser)->tags[idx].first_child])) : NULL)

#define mhtml_tag_sibling( parser, idx ) \
   (0 <= (parser)->tags[idx].next_sibling ? \
      (&((parser)->tags[(parser)->tags[idx].next_sibling])) : NULL)
#endif
83
/* Thin wrappers that forward to the generic mparser helpers using the
 * struct MPARSER embedded as the HTML parser's "base" member.
 */

/* Current parser state. */
#define mhtml_parser_pstate( parser ) \
   mparser_pstate( &((parser)->base) )

/* Push/pop a parser state. When MPARSER_TRACE_NAMES is defined the state name
 * table is passed along as an additional argument.
 */
#ifndef MPARSER_TRACE_NAMES
#  define mhtml_parser_pstate_push( parser, new_pstate ) \
      mparser_pstate_push( "mhtml", &((parser)->base), new_pstate )

#  define mhtml_parser_pstate_pop( parser ) \
      mparser_pstate_pop( "mhtml", &((parser)->base) )
#else
#  define mhtml_parser_pstate_push( parser, new_pstate ) \
      mparser_pstate_push( \
         "mhtml", &((parser)->base), new_pstate, gc_mhtml_pstate_names )

#  define mhtml_parser_pstate_pop( parser ) \
      mparser_pstate_pop( \
         "mhtml", &((parser)->base), gc_mhtml_pstate_names )
#endif /* !MPARSER_TRACE_NAMES */

/* Report character c as invalid in the current state; retval is handed to
 * mparser_invalid_c() by name.
 */
#define mhtml_parser_invalid_c( parser, c, retval ) \
   mparser_invalid_c( mhtml, &((parser)->base), c, retval )

/* Discard the token accumulated so far. */
#define mhtml_parser_reset_token( parser ) \
   mparser_reset_token( "mhtml", &((parser)->base) )

/* Append character c to the token being accumulated. */
#define mhtml_parser_append_token( parser, c ) \
   mparser_append_token( "mhtml", &((parser)->base), c )
111
/* Make the tag at index iter the parser's current tag, tracing the old and
 * new indexes.
 *
 * Wrapped in do { } while( 0 ) so the macro expands to exactly one statement
 * and stays correct as the body of an unbraced if/else. The iter argument is
 * parenthesized; note it is still evaluated twice.
 */
#define mhtml_parser_set_tag_iter( parser, iter ) \
   do { \
      debug_printf( MHTML_TRACE_LVL, "setting tag_iter to: " SSIZE_T_FMT \
         " (previously: " SSIZE_T_FMT ")", \
         (ssize_t)(iter), (parser)->tag_iter ); \
      (parser)->tag_iter = (iter); \
   } while( 0 )

/* Non-zero while the parser's tag vector is locked. The tags member is a
 * struct MDATA_VECTOR rather than a pointer, so this asks the vector instead
 * of comparing the member against NULL.
 */
#define mhtml_parser_is_locked( parser ) \
   (mdata_vector_is_locked( &((parser)->tags) ))
118
120 uint16_t type;
121 uint8_t flags;
122 ssize_t parent;
123 ssize_t first_child;
124 ssize_t next_sibling;
125 ssize_t style;
126 /* TODO: Use str_stable for classes. */
127 char classes[MCSS_CLASS_SZ_MAX + 1];
128 size_t classes_sz;
129 /* TODO: Use str_stable for id. */
130 char id[MCSS_ID_SZ_MAX + 1];
131 size_t id_sz;
132};
133
134#define MHTML_TAG_TABLE_STRUCT( tag_id, tag_name, fields, disp ) \
135 struct MHTML_TAG_ ## tag_name { \
136 struct MHTML_TAG_BASE base; \
137 fields \
138 };
139
140MHTML_TAG_TABLE( MHTML_TAG_TABLE_STRUCT )
141
142#define MHTML_TAG_TABLE_UNION_FIELD( tag_id, tag_name, fields, disp ) \
143 struct MHTML_TAG_ ## tag_name tag_name;
144
146 struct MHTML_TAG_BASE base; /* Should line up w/ 1st "base" in all types. */
147 MHTML_TAG_TABLE( MHTML_TAG_TABLE_UNION_FIELD )
148};
149
151 struct MPARSER base;
152 uint16_t attrib_key;
153 ssize_t tag_iter;
158 uint8_t tag_flags;
159 struct MCSS_PARSER styler;
160 struct MDATA_STRPOOL strpool;
161 struct MDATA_VECTOR tags;
162 ssize_t body_idx;
163};
164
165MERROR_RETVAL mhtml_parser_free( struct MHTML_PARSER* parser );
166
167MERROR_RETVAL mhtml_pop_tag( struct MHTML_PARSER* parser );
168
169MERROR_RETVAL mhtml_parse_c( struct MHTML_PARSER* parser, char c );
170
171MERROR_RETVAL mhtml_parser_init( struct MHTML_PARSER* parser );
172
173MERROR_RETVAL mhtml_dump_tree(
174 struct MHTML_PARSER* parser, ssize_t iter, size_t d );
175
176#ifdef MHTML_C
177
178#define MHTML_PSTATE_TABLE_CONST( name, idx ) \
179 MAUG_CONST uint8_t SEG_MCONST name = idx;
180
181MHTML_PARSER_PSTATE_TABLE( MHTML_PSTATE_TABLE_CONST )
182
183MPARSER_PSTATE_NAMES( MHTML_PARSER_PSTATE_TABLE, mhtml )
184
185#define MHTML_TAG_TABLE_CONST( tag_id, tag_name, fields, disp ) \
186 MAUG_CONST uint16_t SEG_MCONST MHTML_TAG_TYPE_ ## tag_name = tag_id;
187
188MHTML_TAG_TABLE( MHTML_TAG_TABLE_CONST )
189
190#define MHTML_TAG_TABLE_NAMES( tag_id, tag_name, fields, disp ) \
191 #tag_name,
192
193MAUG_CONST char* SEG_MCONST gc_mhtml_tag_names[] = {
194 MHTML_TAG_TABLE( MHTML_TAG_TABLE_NAMES )
195 ""
196};
197
198#define MHTML_ATTRIB_TABLE_NAME( name, idx ) \
199 #name,
200
201static MAUG_CONST char* SEG_MCONST gc_mhtml_attrib_names[] = {
202 MHTML_ATTRIB_TABLE( MHTML_ATTRIB_TABLE_NAME )
203 ""
204};
205
206#define MHTML_ATTRIB_TABLE_NAME_CONST( attrib_name, attrib_id ) \
207 MAUG_CONST uint16_t SEG_MCONST MHTML_ATTRIB_KEY_ ## attrib_name = attrib_id;
208
209MHTML_ATTRIB_TABLE( MHTML_ATTRIB_TABLE_NAME_CONST )
210
211MERROR_RETVAL mhtml_parser_free( struct MHTML_PARSER* parser ) {
212 MERROR_RETVAL retval = MERROR_OK;
213 union MHTML_TAG* tag_iter = NULL;
214
215 debug_printf( MHTML_TRACE_LVL, "freeing HTML parser..." );
216
217 mdata_strpool_free( &(parser->strpool) );
218
219 mdata_vector_lock( &(parser->tags) );
220
221 while( 0 < mdata_vector_ct( &(parser->tags) ) ) {
222 tag_iter = mdata_vector_get( &(parser->tags), 0, union MHTML_TAG );
223 assert( NULL != tag_iter );
224
225 mdata_vector_unlock( &(parser->tags) );
226 mdata_vector_remove( &(parser->tags), 0 );
227 mdata_vector_lock( &(parser->tags) );
228 }
229
230cleanup:
231
232 mcss_parser_free( &(parser->styler) );
233
234 if( mdata_vector_is_locked( &(parser->tags) ) ) {
235 mdata_vector_unlock( &(parser->tags) );
236 }
237
238 mdata_vector_free( &(parser->tags) );
239
240 return retval;
241}
242
243MERROR_RETVAL mhtml_pop_tag( struct MHTML_PARSER* parser ) {
244 MERROR_RETVAL retval = MERROR_OK;
245 union MHTML_TAG* tag_iter = NULL;
246
247 /* Move up from current tag. */
248 assert( parser->tag_iter >= 0 );
249 mdata_vector_lock( &(parser->tags) );
250 tag_iter = mdata_vector_get(
251 &(parser->tags), parser->tag_iter, union MHTML_TAG );
252 assert( NULL != tag_iter );
253
254 mhtml_parser_set_tag_iter( parser, tag_iter->base.parent );
255
256 if( 0 <= parser->tag_iter ) {
257 debug_printf( MHTML_TRACE_LVL,
258 "moved iter back to tag %s (" SIZE_T_FMT ")",
259 gc_mhtml_tag_names[tag_iter->base.type], parser->tag_iter );
260 } else {
261 debug_printf( MHTML_TRACE_LVL, "moved iter back to root (-1)" );
262 }
263
264cleanup:
265
266 mdata_vector_unlock( &(parser->tags) );
267
268 return retval;
269}
270
271MERROR_RETVAL mhtml_push_tag( struct MHTML_PARSER* parser ) {
272 MERROR_RETVAL retval = MERROR_OK;
273 ssize_t new_tag_idx = -1;
274 ssize_t next_sibling_idx = -1;
275 union MHTML_TAG tag_new;
276 union MHTML_TAG* p_tag_new = NULL;
277 union MHTML_TAG* p_tag_iter = NULL;
278
279 maug_mzero( &tag_new, sizeof( union MHTML_TAG ) );
280 tag_new.base.parent = -1;
281 tag_new.base.first_child = -1;
282 tag_new.base.next_sibling = -1;
283 tag_new.base.style = -1;
284 tag_new.base.flags = parser->tag_flags;
285 parser->tag_flags = 0;
286
287 new_tag_idx = mdata_vector_append(
288 &(parser->tags), &tag_new, sizeof( union MHTML_TAG ) );
289 if( 0 > new_tag_idx ) {
290 retval = mdata_retval( new_tag_idx );
291 goto cleanup;
292 }
293
294 mdata_vector_lock( &(parser->tags) );
295 p_tag_new = mdata_vector_get(
296 &(parser->tags), new_tag_idx, union MHTML_TAG );
297 assert( NULL != p_tag_new );
298
299 if( 0 > parser->tag_iter ) {
300 mhtml_parser_set_tag_iter( parser, new_tag_idx );
301 goto cleanup;
302 }
303
304 /* Get the current tag_iter. */
305 p_tag_iter = mdata_vector_get(
306 &(parser->tags), parser->tag_iter, union MHTML_TAG );
307 assert( NULL != p_tag_iter );
308
309 /* Set new tag parent to current tag. */
310 p_tag_new->base.parent = parser->tag_iter;
311
312 /* Add new tag to current tag's children. */
313 if( 0 > p_tag_iter->base.first_child ) {
314 debug_printf( MHTML_TRACE_LVL,
315 "zxzx attached " SSIZE_T_FMT " as first child to "
316 SSIZE_T_FMT, new_tag_idx, parser->tag_iter );
317 p_tag_iter->base.first_child = new_tag_idx;
318 } else {
319 /* Find the last sibling child. */
320 next_sibling_idx = p_tag_iter->base.first_child;
321 p_tag_iter = mdata_vector_get(
322 &(parser->tags), next_sibling_idx, union MHTML_TAG );
323 while( NULL != p_tag_iter && 0 <= p_tag_iter->base.next_sibling ) {
324 next_sibling_idx = p_tag_iter->base.next_sibling;
325 p_tag_iter = mdata_vector_get(
326 &(parser->tags), next_sibling_idx, union MHTML_TAG );
327 }
328 assert( NULL != p_tag_iter );
329 p_tag_iter->base.next_sibling = new_tag_idx;
330 debug_printf( MHTML_TRACE_LVL,
331 "attached " SSIZE_T_FMT " as next sibling to "
332 SSIZE_T_FMT, new_tag_idx, next_sibling_idx );
333 }
334
335 debug_printf( MHTML_TRACE_LVL,
336 "pushed new tag " SSIZE_T_FMT " under " SSIZE_T_FMT,
337 new_tag_idx, p_tag_new->base.parent );
338
339 mhtml_parser_set_tag_iter( parser, new_tag_idx );
340
341cleanup:
342
343 mdata_vector_unlock( &(parser->tags) );
344
345 return retval;
346}
347
348MERROR_RETVAL mhtml_push_element_tag( struct MHTML_PARSER* parser ) {
349 MERROR_RETVAL retval = MERROR_OK;
350 size_t i = 0;
351 union MHTML_TAG* p_tag_iter = NULL;
352
353 mparser_token_upper( &((parser)->base), i );
354
355 if( 0 == strncmp( "STYLE", parser->base.token, 6 ) ) {
356 /* Special case: style tag. Don't push a new tag here, but set a flag for
357 * the text tag next created by mhtml_push_tag() so the contents are
358 * directly attached to the style tag.
359 */
360 parser->tag_flags |= MHTML_TAG_FLAG_STYLE;
361 goto cleanup;
362 }
363
364 retval = mhtml_push_tag( parser );
365 maug_cleanup_if_not_ok();
366
367 mdata_vector_lock( &(parser->tags) );
368
369 p_tag_iter = mdata_vector_get(
370 &(parser->tags), parser->tag_iter, union MHTML_TAG );
371 assert( NULL != p_tag_iter );
372
373 /* Figure out tag type. */
374 i = 0;
375 while( '\0' != gc_mhtml_tag_names[i][0] ) {
376 if(
377 parser->base.token_sz == maug_strlen( gc_mhtml_tag_names[i] ) &&
378 0 == strncmp(
379 gc_mhtml_tag_names[i], parser->base.token, parser->base.token_sz )
380 ) {
381 debug_printf( MHTML_TRACE_LVL,
382 "new tag (" SSIZE_T_FMT ") type: %s",
383 parser->tag_iter, gc_mhtml_tag_names[i] );
384 p_tag_iter->base.type = i;
385
386 if( MHTML_TAG_TYPE_BODY == i ) {
387 /* Special case: body tag. Keep track of it for later so it can
388 * be passed to the renderer.
389 */
390 assert( -1 == parser->body_idx );
391 parser->body_idx = parser->tag_iter;
392 debug_printf( MHTML_TRACE_LVL,
393 "set body index to: " SSIZE_T_FMT,
394 parser->body_idx );
395 }
396
397 goto cleanup;
398 }
399 i++;
400 }
401
402 error_printf( "could not find type for new tag (" SSIZE_T_FMT ")",
403 parser->tag_iter );
404
405cleanup:
406
407 if( mdata_vector_is_locked( &(parser->tags) ) ) {
408 mdata_vector_unlock( &(parser->tags) );
409 }
410
411 return retval;
412}
413
414MERROR_RETVAL mhtml_push_text_tag( struct MHTML_PARSER* parser ) {
415 MERROR_RETVAL retval = MERROR_OK;
416 size_t i = 0;
417 union MHTML_TAG* p_tag_iter = NULL;
418
419 retval = mhtml_push_tag( parser );
420 maug_cleanup_if_not_ok();
421
422 mdata_vector_lock( &(parser->tags) );
423
424 p_tag_iter = mdata_vector_get(
425 &(parser->tags), parser->tag_iter, union MHTML_TAG );
426 assert( NULL != p_tag_iter );
427
428 if(
429 MHTML_TAG_FLAG_STYLE == (MHTML_TAG_FLAG_STYLE &
430 p_tag_iter->base.flags)
431 ) {
432 p_tag_iter->base.type = MHTML_TAG_TYPE_STYLE;
433 } else {
434 p_tag_iter->base.type = MHTML_TAG_TYPE_TEXT;
435 }
436
437 if( MHTML_TAG_TYPE_STYLE == p_tag_iter->base.type ) {
438 /* TODO: If it's the last character and there's still a token, process it! */
439 debug_printf( MHTML_TRACE_LVL, "parsing STYLE tag..." );
440 for( ; parser->base.token_sz > i ; i++ ) {
441 retval = mcss_parse_c( &(parser->styler), parser->base.token[i] );
442 maug_cleanup_if_not_ok();
443 }
444 debug_printf( 1, "out of style characters..." );
445 mcss_parser_flush( &(parser->styler) );
446 mcss_parser_reset( &(parser->styler) );
447 } else {
448 /* Eliminate trailing spaces. */
449 while( ' ' == parser->base.token[parser->base.token_sz - 1] ) {
450 parser->base.token_sz--;
451 }
452
453 /* Copy token to tag text. */
454 p_tag_iter->TEXT.content_idx = mdata_strpool_append(
455 &(parser->strpool), parser->base.token, parser->base.token_sz,
456 MDATA_STRPOOL_FLAG_DEDUPE );
457 maug_cleanup_if_eq(
458 p_tag_iter->TEXT.content_idx, 0, SIZE_T_FMT, MERROR_ALLOC );
459 p_tag_iter->TEXT.content_sz = parser->base.token_sz;
460 }
461
462 debug_printf( 1, "done processing tag contents..." );
463
464cleanup:
465
466 if( mdata_vector_is_locked( &(parser->tags) ) ) {
467 mdata_vector_unlock( &(parser->tags) );
468 }
469
470 return retval;
471}
472
473MERROR_RETVAL mhtml_push_attrib_key( struct MHTML_PARSER* parser ) {
474 MERROR_RETVAL retval = MERROR_OK;
475 size_t i = 0;
476
477 debug_printf( MHTML_TRACE_LVL, "attrib: %s", parser->base.token );
478
479 mparser_token_upper( &((parser)->base), i );
480
481 /* Figure out attrib type. */
482 i = 0;
483 while( '\0' != gc_mhtml_attrib_names[i][0] ) {
484 if(
485 parser->base.token_sz == maug_strlen( gc_mhtml_attrib_names[i] ) &&
486 0 == strncmp(
487 gc_mhtml_attrib_names[i], parser->base.token, parser->base.token_sz )
488 ) {
489 debug_printf(
490 MHTML_TRACE_LVL, "new attrib type: %s", gc_mhtml_attrib_names[i] );
491 parser->attrib_key = i;
492 goto cleanup;
493 }
494 i++;
495 }
496
497 error_printf( "unknown attrib: %s", parser->base.token );
498
499cleanup:
500
501 return retval;
502}
503
504static MERROR_RETVAL _mhtml_set_attrib_val( struct MHTML_PARSER* parser ) {
505 MERROR_RETVAL retval = MERROR_OK;
506 size_t i = 0;
507 union MHTML_TAG* p_tag_iter = NULL;
508
509 mdata_vector_lock( &(parser->tags) );
510
511 p_tag_iter = mdata_vector_get(
512 &(parser->tags), parser->tag_iter, union MHTML_TAG );
513 assert( NULL != p_tag_iter );
514
515 if( MHTML_ATTRIB_KEY_STYLE == parser->attrib_key ) {
516 debug_printf( MHTML_TRACE_LVL, "style: %s", parser->base.token );
517 /* TODO: Parse and attach style. */
518
519 /* Create an empty new style. */
520 mdata_vector_unlock( &(parser->tags) );
521 retval = mcss_push_style( &(parser->styler), MCSS_SELECT_NONE, NULL, 0 );
522 maug_cleanup_if_not_ok();
523 mdata_vector_lock( &(parser->tags) );
524
525 /* Set the new style as this tag's explicit style. */
526 p_tag_iter->base.style =
527 mdata_vector_ct( &(parser->styler.styles) ) - 1;
528
529 for( ; parser->base.token_sz > i ; i++ ) {
530 retval = mcss_parse_c( &(parser->styler), parser->base.token[i] );
531 maug_cleanup_if_not_ok();
532 }
533
534 debug_printf( 1, "out of style characters..." );
535 mcss_parser_flush( &(parser->styler) );
536
537 goto cleanup;
538
539 } else if( MHTML_ATTRIB_KEY_CLASS == parser->attrib_key ) {
540 maug_strncpy(
541 p_tag_iter->base.classes,
542 parser->base.token,
543 MCSS_CLASS_SZ_MAX );
544 p_tag_iter->base.classes_sz = parser->base.token_sz;
545
546 } else if( MHTML_ATTRIB_KEY_ID == parser->attrib_key ) {
547 maug_strncpy(
548 p_tag_iter->base.id,
549 parser->base.token,
550 MCSS_ID_SZ_MAX );
551 p_tag_iter->base.id_sz = parser->base.token_sz;
552
553 } else if( MHTML_ATTRIB_KEY_SRC == parser->attrib_key ) {
554 /* TODO: Validate tag type. */
555 maug_strncpy(
556 p_tag_iter->IMG.src,
557 parser->base.token,
558 MHTML_SRC_HREF_SZ_MAX );
559 p_tag_iter->IMG.src_sz = parser->base.token_sz;
560
561 } else if( MHTML_ATTRIB_KEY_TYPE == parser->attrib_key ) {
562 /* TODO: Validate tag type. */
563
564 maug_strncpy( parser->base.token, "button", 7 );
565 /* if( 0 == ) { */ /* TODO: Why were we checking this? */
566 p_tag_iter->INPUT.input_type =
567 MHTML_INPUT_TYPE_BUTTON;
568
569 } else if( MHTML_ATTRIB_KEY_NAME == parser->attrib_key ) {
570 /* TODO: Validate tag type. */
571 maug_strncpy(
572 p_tag_iter->INPUT.name,
573 parser->base.token,
574 MCSS_ID_SZ_MAX );
575 p_tag_iter->INPUT.name_sz = parser->base.token_sz;
576
577 } else if( MHTML_ATTRIB_KEY_VALUE == parser->attrib_key ) {
578 /* TODO: Validate tag type. */
579 maug_strncpy(
580 p_tag_iter->INPUT.value,
581 parser->base.token,
582 MCSS_ID_SZ_MAX );
583 p_tag_iter->INPUT.value_sz = parser->base.token_sz;
584 }
585
586cleanup:
587
588 if( mdata_vector_is_locked( &(parser->tags) ) ) {
589 mdata_vector_unlock( &(parser->tags) );
590 }
591
592 return retval;
593}
594
595MERROR_RETVAL mhtml_parse_c( struct MHTML_PARSER* parser, char c ) {
596 MERROR_RETVAL retval = MERROR_OK;
597 union MHTML_TAG* p_tag_iter = NULL;
598 size_t tag_iter_type = 0;
599
600 switch( c ) {
601 case '<':
602 if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
603 if( 0 < parser->base.token_sz ) {
604 retval = mhtml_push_text_tag( parser );
605 maug_cleanup_if_not_ok();
606
607 /* Grab the current tag to check its type below. */
608 mdata_vector_lock( &(parser->tags) );
609 p_tag_iter = mdata_vector_get(
610 &(parser->tags), parser->tag_iter, union MHTML_TAG );
611 assert( NULL != p_tag_iter );
612 tag_iter_type = p_tag_iter->base.type;
613 mdata_vector_unlock( &(parser->tags) );
614
615 if(
616 /* See special exception in mhtml_push_tag(). Style tags don't
617 * push their subordinate text, so popping here would be
618 * uneven!
619 */
620 MHTML_TAG_TYPE_STYLE != tag_iter_type
621 ) {
622 /* Pop out of text so next tag isn't a child of it. */
623 retval = mhtml_pop_tag( parser );
624 maug_cleanup_if_not_ok();
625 }
626 }
627 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_ELEMENT );
628 maug_cleanup_if_not_ok();
629 mhtml_parser_reset_token( parser );
630
631 } else {
632 mhtml_parser_invalid_c( parser, c, retval );
633 }
634 break;
635
636 case '>':
637 if( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) ) {
638 retval = mhtml_push_element_tag( parser );
639 maug_cleanup_if_not_ok();
640 mhtml_parser_pstate_pop( parser );
641 mhtml_parser_reset_token( parser );
642
643 } else if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
644 mhtml_parser_pstate_pop( parser );
645 assert( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) );
646 mhtml_parser_pstate_pop( parser ); /* Pop element. */
647 mhtml_parser_reset_token( parser );
648
649 } else if( MHTML_PSTATE_END_ELEMENT == mhtml_parser_pstate( parser ) ) {
650
651 retval = mhtml_pop_tag( parser );
652 maug_cleanup_if_not_ok();
653
654 mhtml_parser_pstate_pop( parser );
655 if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
656 mhtml_parser_pstate_pop( parser );
657 }
658 assert( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) );
659 mhtml_parser_pstate_pop( parser ); /* Pop element. */
660 mhtml_parser_reset_token( parser );
661
662 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
663 retval = mhtml_parser_append_token( parser, c );
664 maug_cleanup_if_not_ok();
665
666 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
667 retval = mhtml_parser_append_token( parser, c );
668 maug_cleanup_if_not_ok();
669
670 } else {
671 mhtml_parser_invalid_c( parser, c, retval );
672 }
673 break;
674
675 case '/':
676 if(
677 MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) &&
678 0 == parser->base.token_sz
679 ) {
680 /* Start of a close tag. */
681 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_END_ELEMENT );
682 maug_cleanup_if_not_ok();
683
684 } else if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
685 /* Close of a self-closing tag. */
686 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_END_ELEMENT );
687 maug_cleanup_if_not_ok();
688
689 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
690 retval = mhtml_parser_append_token( parser, c );
691 maug_cleanup_if_not_ok();
692
693 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
694 retval = mhtml_parser_append_token( parser, c );
695 maug_cleanup_if_not_ok();
696
697 } else {
698 mhtml_parser_invalid_c( parser, c, retval );
699 }
700 break;
701
702 case '=':
703 if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
704 retval = mhtml_push_attrib_key( parser );
705 maug_cleanup_if_not_ok();
706 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_ATTRIB_VAL );
707 maug_cleanup_if_not_ok();
708 mhtml_parser_reset_token( parser );
709
710 } else if( MHTML_PSTATE_ATTRIB_VAL == mhtml_parser_pstate( parser ) ) {
711 retval = mhtml_parser_append_token( parser, c );
712 maug_cleanup_if_not_ok();
713
714 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
715 retval = mhtml_parser_append_token( parser, c );
716 maug_cleanup_if_not_ok();
717
718 } else {
719 mhtml_parser_invalid_c( parser, '_', retval );
720 }
721 break;
722
723 case '"':
724 if( MHTML_PSTATE_ATTRIB_VAL == mhtml_parser_pstate( parser ) ) {
725 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_STRING );
726 maug_cleanup_if_not_ok();
727 mhtml_parser_reset_token( parser );
728
729 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
730 retval = _mhtml_set_attrib_val( parser );
731 maug_cleanup_if_not_ok();
732 mhtml_parser_pstate_pop( parser );
733 assert( MHTML_PSTATE_ATTRIB_VAL == mhtml_parser_pstate( parser ) );
734 mhtml_parser_pstate_pop( parser );
735 mhtml_parser_reset_token( parser );
736
737 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
738 retval = mhtml_parser_append_token( parser, c );
739 maug_cleanup_if_not_ok();
740
741 } else {
742 mhtml_parser_invalid_c( parser, '_', retval );
743 }
744 break;
745
746 case '\r':
747 case '\n':
748 case '\t':
749 break;
750
751 case ' ':
752 if( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) ) {
753 retval = mhtml_push_element_tag( parser );
754 maug_cleanup_if_not_ok();
755 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_ATTRIB_KEY );
756 maug_cleanup_if_not_ok();
757 mhtml_parser_reset_token( parser );
758
759 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
760 retval = mhtml_parser_append_token( parser, c );
761 maug_cleanup_if_not_ok();
762
763 } else if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
764 /* Do nothing. */
765
766 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
767 /* Avoid a token that's only whitespace. */
768 if(
769 0 < parser->base.token_sz &&
770 ' ' != parser->base.token[parser->base.token_sz - 1]
771 ) {
772 retval = mhtml_parser_append_token( parser, ' ' );
773 maug_cleanup_if_not_ok();
774 }
775
776 } else {
777 mhtml_parser_invalid_c( parser, '_', retval );
778 }
779 break;
780
781 default:
782 retval = mhtml_parser_append_token( parser, c );
783 maug_cleanup_if_not_ok();
784 break;
785 }
786
787 parser->base.i++;
788
789 mparser_wait( &((parser)->base) );
790
791cleanup:
792
793 parser->base.last_c = c;
794
795 if( mdata_vector_is_locked( &(parser->tags) ) ) {
796 mdata_vector_unlock( &(parser->tags) );
797 }
798
799 return retval;
800}
801
802MERROR_RETVAL mhtml_parser_init( struct MHTML_PARSER* parser ) {
803 MERROR_RETVAL retval = MERROR_OK;
804
805 /* Perform initial tag allocation. */
806 mhtml_parser_set_tag_iter( parser, -1 );
807 parser->body_idx = -1;
808
809 retval = mcss_parser_init( &(parser->styler) );
810 maug_cleanup_if_not_ok();
811
812cleanup:
813
814 return retval;
815}
816
817MERROR_RETVAL mhtml_dump_tree(
818 struct MHTML_PARSER* parser, ssize_t iter, size_t d
819) {
820 size_t i = 0;
821 char dump_line[MHTML_DUMP_LINE_SZ + 1];
822 union MHTML_TAG* p_tag_iter = NULL;
823 ssize_t first_child = -1;
824 ssize_t next_sibling = -1;
825 MERROR_RETVAL retval = MERROR_OK;
826 char* tag_contents = NULL;
827
828 if( 0 > iter ) {
829 return retval;
830 }
831
832 mdata_vector_lock( &(parser->tags) );
833
834 p_tag_iter = mdata_vector_get( &(parser->tags), iter, union MHTML_TAG );
835 assert( NULL != p_tag_iter );
836
837 maug_mzero( dump_line, MHTML_DUMP_LINE_SZ + 1 );
838
839 for( i = 0 ; d > i ; i++ ) {
840 assert( i < MHTML_DUMP_LINE_SZ );
841 strcat( dump_line, " " );
842 }
843 if( MHTML_TAG_TYPE_TEXT == p_tag_iter->base.type ) {
844 if( -1 == p_tag_iter->TEXT.content_idx ) {
845 error_printf( "no tag content present!" );
846 goto cleanup;
847 }
848
849 mdata_strpool_lock( &(parser->strpool) );
850
851 if(
852 maug_strlen( dump_line ) + 7 /* ("TEXT: \n") */
853 + p_tag_iter->TEXT.content_sz < MHTML_DUMP_LINE_SZ
854 ) {
855 strcat( dump_line, "TEXT: " );
856 tag_contents = mdata_strpool_get(
857 &(parser->strpool), p_tag_iter->TEXT.content_idx );
858 if( NULL == tag_contents ) {
859 error_printf( "could not retrieve tag contents!" );
860 retval = MERROR_ALLOC;
861 goto cleanup;
862 }
863 strcat( dump_line, tag_contents );
864 strcat( dump_line, "\n" );
865 }
866
867 mdata_strpool_unlock( &(parser->strpool) );
868
869 } else {
870 if(
871 maug_strlen( dump_line ) +
872 maug_strlen( gc_mhtml_tag_names[p_tag_iter->base.type] ) <
873 MHTML_DUMP_LINE_SZ
874 ) {
875 strcat( dump_line,
876 gc_mhtml_tag_names[p_tag_iter->base.type] );
877 }
878
879 if(
880 0 <= p_tag_iter->base.style &&
881 maug_strlen( dump_line ) + 9 /* (styled) */ < MHTML_DUMP_LINE_SZ
882 ) {
883 strcat( dump_line, " (styled)" );
884 }
885
886 if(
887 0 < p_tag_iter->base.id_sz &&
888 maug_strlen( dump_line ) + 7 /* (id: ) */
889 + maug_strlen( p_tag_iter->base.id ) < MHTML_DUMP_LINE_SZ
890 ) {
891 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
892 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
893 " (id: %s)", p_tag_iter->base.id );
894 }
895
896 if(
897 0 < p_tag_iter->base.classes_sz &&
898 maug_strlen( dump_line ) + 12 /* (classes: ) */
899 + maug_strlen( p_tag_iter->base.id ) < MHTML_DUMP_LINE_SZ
900 ) {
901 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
902 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
903 " (classes: %s)", p_tag_iter->base.classes );
904 }
905
906 if(
907 MHTML_TAG_TYPE_IMG == p_tag_iter->base.type &&
908 0 < p_tag_iter->IMG.src_sz &&
909 maug_strlen( dump_line ) + 8 /* (src: ) */
910 + maug_strlen( p_tag_iter->IMG.src ) < MHTML_DUMP_LINE_SZ
911 ) {
912 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
913 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
914 " (src: %s)", p_tag_iter->IMG.src );
915 }
916
917 if(
918 MHTML_TAG_TYPE_INPUT == p_tag_iter->base.type &&
919 0 < p_tag_iter->INPUT.value_sz &&
920 maug_strlen( dump_line ) + 10 /* (value: ) */
921 + maug_strlen( p_tag_iter->INPUT.value ) < MHTML_DUMP_LINE_SZ
922 ) {
923 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
924 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
925 " (value: %s)", p_tag_iter->INPUT.value );
926 }
927
928 }
929
930 debug_printf( 1, "%s", dump_line );
931
932 first_child = p_tag_iter->base.first_child;
933 next_sibling = p_tag_iter->base.next_sibling;
934
935 mdata_vector_unlock( &(parser->tags) );
936
937 retval = mhtml_dump_tree( parser, first_child, d + 1 );
938 maug_cleanup_if_not_ok();
939
940 retval = mhtml_dump_tree( parser, next_sibling, d );
941 maug_cleanup_if_not_ok();
942
943cleanup:
944
945 if( mdata_vector_is_locked( &(parser->tags) ) ) {
946 mdata_vector_unlock( &(parser->tags) );
947 }
948
949 return retval;
950}
951
952#else
953
954#define MHTML_TAG_TABLE_CONST( tag_id, tag_name, fields, disp ) \
955 extern MAUG_CONST uint16_t SEG_MCONST MHTML_TAG_TYPE_ ## tag_name;
956
957MHTML_TAG_TABLE( MHTML_TAG_TABLE_CONST )
958
959extern MAUG_CONST char* SEG_MCONST gc_mhtml_tag_names[];
960
961#endif /* MHTML_C */
962
963#endif /* !MHTML_H */
964
uint16_t MERROR_RETVAL
Return type indicating function returns a value from this list.
Definition: merror.h:19
ssize_t mdata_vector_append(struct MDATA_VECTOR *v, const void *item, size_t item_sz)
Append an item to the specified vector.
MERROR_RETVAL mdata_vector_remove(struct MDATA_VECTOR *v, size_t idx)
Remove item at the given index, shifting subsequent items up by 1.
A pool of immutable text strings. Deduplicates strings to save memory.
Definition: mdata.h:68
A vector of uniformly-sized objects, stored contiguously.
Definition: mdata.h:93
#define mdata_vector_lock(v)
Lock the vector. This should be done when items from the vector are actively being referenced,...
Definition: mdata.h:320
#define mdata_vector_unlock(v)
Unlock the vector so items may be added and removed.
Definition: mdata.h:353
#define mdata_vector_ct(v)
Number of items of MDATA_VECTOR::item_sz bytes actively stored in this vector.
Definition: mdata.h:396
Definition: mhtml.h:150
uint8_t tag_flags
Flags to be pushed to MHTML_TAG_BASE::flags on next mhtml_push_tag().
Definition: mhtml.h:158
Definition: mhtml.h:119
Definition: mhtml.h:145