1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2025, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 SWI-Prolog Solutions b.v. 10 All rights reserved. 11 12 Redistribution and use in source and binary forms, with or without 13 modification, are permitted provided that the following conditions 14 are met: 15 16 1. Redistributions of source code must retain the above copyright 17 notice, this list of conditions and the following disclaimer. 18 19 2. Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in 21 the documentation and/or other materials provided with the 22 distribution. 23 24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 25 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 27 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 28 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 29 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 30 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 31 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 32 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 34 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 POSSIBILITY OF SUCH DAMAGE. 36*/ 37 38:- module(sgml, 39 [ load_html/3, % +Input, -DOM, +Options 40 load_xml/3, % +Input, -DOM, +Options 41 load_sgml/3, % +Input, -DOM, +Options 42 43 load_sgml_file/2, % +File, -ListOfContent 44 load_xml_file/2, % +File, -ListOfContent 45 load_html_file/2, % +File, -Document 46 47 load_structure/3, % +File, -Term, +Options 48 49 load_dtd/2, % +DTD, +File 50 load_dtd/3, % +DTD, +File, +Options 51 dtd/2, % +Type, -DTD 52 dtd_property/2, % +DTD, ?Property 53 54 new_dtd/2, % +Doctype, -DTD 55 free_dtd/1, % +DTD 56 open_dtd/3, % +DTD, +Options, -Stream 57 58 new_sgml_parser/2, % -Parser, +Options 59 free_sgml_parser/1, % +Parser 60 set_sgml_parser/2, % +Parser, +Options 61 get_sgml_parser/2, % +Parser, +Options 62 sgml_parse/2, % +Parser, +Options 63 64 sgml_register_catalog_file/2, % +File, +StartOrEnd 65 66 xml_quote_attribute/3, % +In, -Quoted, +Encoding 67 xml_quote_cdata/3, % +In, -Quoted, +Encoding 68 xml_quote_attribute/2, % +In, -Quoted 69 xml_quote_cdata/2, % +In, -Quoted 70 xml_name/1, % +In 71 xml_name/2, % +In, +Encoding 72 73 xsd_number_string/2, % ?Number, ?String 74 xsd_time_string/3, % ?Term, ?Type, ?String 75 76 xml_basechar/1, % +Code 77 xml_ideographic/1, % +Code 78 xml_combining_char/1, % +Code 79 xml_digit/1, % +Code 80 xml_extender/1, % +Code 81 82 iri_xml_namespace/2, % +IRI, -Namespace 83 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 84 xml_is_dom/1 % +Term 85 ]). 86:- autoload(library(error),[instantiation_error/1]). 87:- autoload(library(iostream),[open_any/5,close_any/1]). 88:- autoload(library(lists),[member/2,selectchk/3]). 89:- autoload(library(option),[select_option/3,merge_options/3]). 90 91:- meta_predicate 92 load_structure( , , ), 93 load_html( , , ), 94 load_xml( , , ), 95 load_sgml( , , ). 96 97:- predicate_options(load_structure/3, 3, 98 [ charpos(integer), 99 cdata(oneof([atom,string])), 100 defaults(boolean), 101 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 102 doctype(atom), 103 dtd(any), 104 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 105 entity(atom,atom), 106 keep_prefix(boolean), 107 file(atom), 108 line(integer), 109 offset(integer), 110 number(oneof([token,integer])), 111 qualify_attributes(boolean), 112 shorttag(boolean), 113 case_sensitive_attributes(boolean), 114 case_preserving_attributes(boolean), 115 system_entities(boolean), 116 max_memory(integer), 117 ignore_doctype(boolean), 118 space(oneof([sgml,preserve,default,remove,strict])), 119 xmlns(atom), 120 xmlns(atom,atom), 121 pass_to(sgml_parse/2, 2) 122 ]). 123:- predicate_options(load_html/3, 3, 124 [ pass_to(load_structure/3, 3) 125 ]). 126:- predicate_options(load_xml/3, 3, 127 [ pass_to(load_structure/3, 3) 128 ]). 129:- predicate_options(load_sgml/3, 3, 130 [ pass_to(load_structure/3, 3) 131 ]). 132:- predicate_options(load_dtd/3, 3, 133 [ dialect(oneof([sgml,xml,xmlns])), 134 pass_to(open/4, 4) 135 ]). 136:- predicate_options(sgml_parse/2, 2, 137 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 138 callable), 139 cdata(oneof([atom,string])), 140 content_length(integer), 141 document(-any), 142 max_errors(integer), 143 parse(oneof([file,element,content,declaration,input])), 144 source(any), 145 syntax_errors(oneof([quiet,print,style])), 146 xml_no_ns(oneof([error,quiet])) 147 ]). 148:- predicate_options(new_sgml_parser/2, 2, 149 [ dtd(any) 150 ]).
180:- multifile user:file_search_path/2. 181:- dynamic user:file_search_path/2. 182 183user:file_search_path(dtd, library('DTD')). 184 185sgml_register_catalog_file(File, Location) :- 186 prolog_to_os_filename(File, OsFile), 187 '_sgml_register_catalog_file'(OsFile, Location). 188 189:- use_foreign_library(foreign(sgml2pl)). 190 191register_catalog(Base) :- 192 absolute_file_name(dtd(Base), SocFile, 193 [ extensions([soc]), 194 access(read), 195 file_errors(fail) 196 ]), 197 sgml_register_catalog_file(SocFile, end). 198 199:- initialization 200 ignore(register_catalog('HTML4')). 201 202 203 /******************************* 204 * DTD HANDLING * 205 *******************************/ 206 207/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 208Note that concurrent access to DTD objects is not allowed, and hence we 209will allocate and destroy them in each thread. Possibibly it would be 210nicer to find out why concurrent access to DTD's is flawed. It is 211diagnosed to mess with the entity resolution by Fabien Todescato. 212- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 213 214:- thread_local 215 current_dtd/2. 216:- volatile 217 current_dtd/2. 218:- thread_local 219 registered_cleanup/0. 220:- volatile 221 registered_cleanup/0. 222 223:- multifile 224 dtd_alias/2. 225 226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227 228dtd_alias(html4, 'HTML4'). 229dtd_alias(html5, 'HTML5'). 230dtd_alias(html, DTD) :- 231 current_prolog_flag(html_dialect, Dialect), 232 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
244dtd(Type, DTD) :- 245 current_dtd(Type, DTD), 246 !. 247dtd(Type, DTD) :- 248 new_dtd(Type, DTD), 249 ( dtd_alias(Type, Base) 250 -> true 251 ; Base = Type 252 ), 253 absolute_file_name(dtd(Base), 254 [ extensions([dtd]), 255 access(read) 256 ], DtdFile), 257 load_dtd(DTD, DtdFile), 258 register_cleanup, 259 asserta(current_dtd(Type, DTD)).
274load_dtd(DTD, DtdFile) :- 275 load_dtd(DTD, DtdFile, []). 276load_dtd(DTD, DtdFile, Options) :- 277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 278 setup_call_cleanup( 279 open_dtd(DTD, DTDOptions, DtdOut), 280 setup_call_cleanup( 281 open(DtdFile, read, DtdIn, OpenOptions), 282 copy_stream_data(DtdIn, DtdOut), 283 close(DtdIn)), 284 close(DtdOut)).
291destroy_dtds :-
292 ( current_dtd(_Type, DTD),
293 free_dtd(DTD),
294 fail
295 ; true
296 ).
302register_cleanup :- 303 registered_cleanup, 304 !. 305register_cleanup :- 306 ( thread_self(main) 307 -> at_halt(destroy_dtds) 308 ; current_prolog_flag(threads, true) 309 -> prolog_listen(this_thread_exit, destroy_dtds) 310 ; true 311 ), 312 assert(registered_cleanup). 313 314 315 /******************************* 316 * EXAMINE DTD * 317 *******************************/ 318 319prop(doctype(_), _). 320prop(elements(_), _). 321prop(entities(_), _). 322prop(notations(_), _). 323prop(entity(E, _), DTD) :- 324 ( nonvar(E) 325 -> true 326 ; '$dtd_property'(DTD, entities(EL)), 327 member(E, EL) 328 ). 329prop(element(E, _, _), DTD) :- 330 ( nonvar(E) 331 -> true 332 ; '$dtd_property'(DTD, elements(EL)), 333 member(E, EL) 334 ). 335prop(attributes(E, _), DTD) :- 336 ( nonvar(E) 337 -> true 338 ; '$dtd_property'(DTD, elements(EL)), 339 member(E, EL) 340 ). 341prop(attribute(E, A, _, _), DTD) :- 342 ( nonvar(E) 343 -> true 344 ; '$dtd_property'(DTD, elements(EL)), 345 member(E, EL) 346 ), 347 ( nonvar(A) 348 -> true 349 ; '$dtd_property'(DTD, attributes(E, AL)), 350 member(A, AL) 351 ). 352prop(notation(N, _), DTD) :- 353 ( nonvar(N) 354 -> true 355 ; '$dtd_property'(DTD, notations(NL)), 356 member(N, NL) 357 ). 358 359dtd_property(DTD, Prop) :- 360 prop(Prop, DTD), 361 '$dtd_property'(DTD, Prop). 362 363 364 /******************************* 365 * SGML * 366 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.390load_structure(Spec, DOM, Options) :- 391 sgml_open_options(Options, OpenOptions, SGMLOptions), 392 setup_call_cleanup( 393 open_any(Spec, read, In, Close, OpenOptions), 394 load_structure_from_stream(In, DOM, SGMLOptions), 395 close_any(Close)). 396 397sgml_open_options(Options, OpenOptions, SGMLOptions) :- 398 Options = M:Plain, 399 ( select_option(encoding(Encoding), Plain, NoEnc) 400 -> ( sgml_encoding(Encoding) 401 -> merge_options(NoEnc, [type(binary)], OpenOptions), 402 SGMLOptions = Options 403 ; OpenOptions = Plain, 404 SGMLOptions = M:NoEnc 405 ) 406 ; merge_options(Plain, [type(binary)], OpenOptions), 407 SGMLOptions = Options 408 ). 409 410sgml_encoding(Enc) :- 411 downcase_atom(Enc, Enc1), 412 sgml_encoding_l(Enc1). 413 414sgml_encoding_l('iso-8859-1'). 415sgml_encoding_l('us-ascii'). 416sgml_encoding_l('utf-8'). 417sgml_encoding_l('utf8'). 418sgml_encoding_l('iso_latin_1'). 419sgml_encoding_l('ascii'). 420 421load_structure_from_stream(In, Term, M:Options) :- 422 ( select_option(dtd(DTD), Options, Options1) 423 -> ExplicitDTD = true 424 ; ExplicitDTD = false, 425 Options1 = Options 426 ), 427 move_front(Options1, dialect(_), Options2), % dialect sets defaults 428 setup_call_cleanup( 429 new_sgml_parser(Parser, 430 [ dtd(DTD) 431 ]), 432 parse(Parser, M:Options2, TermRead, In), 433 free_sgml_parser(Parser)), 434 ( ExplicitDTD == true 435 -> ( DTD = dtd(_, DocType), 436 dtd_property(DTD, doctype(DocType)) 437 -> true 438 ; true 439 ) 440 ; free_dtd(DTD) 441 ), 442 Term = TermRead. 443 444move_front(Options0, Opt, Options) :- 445 selectchk(Opt, Options0, Options1), 446 !, 447 Options = [Opt|Options1]. 448move_front(Options, _, Options). 449 450 451parse(Parser, M:Options, Document, In) :- 452 set_parser_options(Options, Parser, In, Options1), 453 parser_meta_options(Options1, M, Options2), 454 set_input_location(Parser, In), 455 sgml_parse(Parser, 456 [ document(Document), 457 source(In) 458 | Options2 459 ]). 460 461set_parser_options([], _, _, []). 462set_parser_options([H|T], Parser, In, Rest) :- 463 ( set_parser_option(H, Parser, In) 464 -> set_parser_options(T, Parser, In, Rest) 465 ; Rest = [H|R2], 466 set_parser_options(T, Parser, In, R2) 467 ). 468 469set_parser_option(Var, _Parser, _In) :- 470 var(Var), 471 !, 472 instantiation_error(Var). 473set_parser_option(Option, Parser, _) :- 474 def_entity(Option, Parser), 475 !. 476set_parser_option(offset(Offset), _Parser, In) :- 477 !, 478 seek(In, Offset, bof, _). 479set_parser_option(Option, Parser, _In) :- 480 parser_option(Option), 481 !, 482 set_sgml_parser(Parser, Option). 483set_parser_option(Name=Value, Parser, In) :- 484 Option =.. [Name,Value], 485 set_parser_option(Option, Parser, In). 486 487 488parser_option(dialect(_)). 489parser_option(shorttag(_)). 490parser_option(case_sensitive_attributes(_)). 491parser_option(case_preserving_attributes(_)). 492parser_option(system_entities(_)). 493parser_option(max_memory(_)). 494parser_option(ignore_doctype(_)). 495parser_option(file(_)). 496parser_option(line(_)). 497parser_option(space(_)). 498parser_option(number(_)). 499parser_option(defaults(_)). 500parser_option(doctype(_)). 501parser_option(qualify_attributes(_)). 502parser_option(encoding(_)). 503parser_option(keep_prefix(_)). 504 505 506def_entity(entity(Name, Value), Parser) :- 507 get_sgml_parser(Parser, dtd(DTD)), 508 xml_quote_attribute(Value, QValue), 509 setup_call_cleanup(open_dtd(DTD, [], Stream), 510 format(Stream, '<!ENTITY ~w "~w">~n', 511 [Name, QValue]), 512 close(Stream)). 513def_entity(xmlns(URI), Parser) :- 514 set_sgml_parser(Parser, xmlns(URI)). 515def_entity(xmlns(NS, URI), Parser) :- 516 set_sgml_parser(Parser, xmlns(NS, URI)).
522parser_meta_options([], _, []). 523parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 524 !, 525 parser_meta_options(T0, M, T). 526parser_meta_options([H|T0], M, [H|T]) :- 527 parser_meta_options(T0, M, T).
534set_input_location(Parser, _In) :- 535 get_sgml_parser(Parser, file(_)), 536 !. 537set_input_location(Parser, In) :- 538 stream_property(In, file_name(File)), 539 !, 540 set_sgml_parser(Parser, file(File)), 541 stream_property(In, position(Pos)), 542 set_sgml_parser(Parser, position(Pos)). 543set_input_location(_, _). 544 545 /******************************* 546 * UTILITIES * 547 *******************************/
556load_sgml_file(File, Term) :-
557 load_sgml(File, Term, []).
566load_xml_file(File, Term) :-
567 load_xml(File, Term, []).
576load_html_file(File, DOM) :-
577 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open) to support loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
606load_html(File, Term, M:Options) :-
607 current_prolog_flag(html_dialect, Dialect),
608 dtd(Dialect, DTD),
609 merge_options(Options,
610 [ dtd(DTD),
611 dialect(Dialect),
612 max_errors(-1),
613 syntax_errors(quiet)
614 ], Options1),
615 load_structure(File, Term, M:Options1).
dialect(xml)
625load_xml(Input, DOM, M:Options) :-
626 merge_options(Options,
627 [ dialect(xml)
628 ], Options1),
629 load_structure(Input, DOM, M:Options1).
dialect(sgml)
639load_sgml(Input, DOM, M:Options) :- 640 merge_options(Options, 641 [ dialect(sgml) 642 ], Options1), 643 load_structure(Input, DOM, M:Options1). 644 645 646 647 /******************************* 648 * ENCODING * 649 *******************************/
659xml_quote_attribute(In, Quoted) :- 660 xml_quote_attribute(In, Quoted, ascii). 661 662xml_quote_cdata(In, Quoted) :- 663 xml_quote_cdata(In, Quoted, ascii).
669xml_name(In) :- 670 xml_name(In, ascii). 671 672 673 /******************************* 674 * XML CHARACTER CLASSES * 675 *******************************/
690 /******************************* 691 * TYPE CHECKING * 692 *******************************/
699xml_is_dom(0) :- !, fail. % catch variables 700xml_is_dom(List) :- 701 is_list(List), 702 !, 703 xml_is_content_list(List). 704xml_is_dom(Term) :- 705 xml_is_element(Term). 706 707xml_is_content_list([]). 708xml_is_content_list([H|T]) :- 709 xml_is_content(H), 710 xml_is_content_list(T). 711 712xml_is_content(0) :- !, fail. 713xml_is_content(pi(Pi)) :- 714 !, 715 atom(Pi). 716xml_is_content(CDATA) :- 717 atom(CDATA), 718 !. 719xml_is_content(CDATA) :- 720 string(CDATA), 721 !. 722xml_is_content(Term) :- 723 xml_is_element(Term). 724 725xml_is_element(element(Name, Attributes, Content)) :- 726 dom_name(Name), 727 dom_attributes(Attributes), 728 xml_is_content_list(Content). 729 730dom_name(NS:Local) :- 731 atom(NS), 732 atom(Local), 733 !. 734dom_name(Local) :- 735 atom(Local). 736 737dom_attributes(0) :- !, fail. 738dom_attributes([]). 739dom_attributes([H|T]) :- 740 dom_attribute(H), 741 dom_attributes(T). 742 743dom_attribute(Name=Value) :- 744 dom_name(Name), 745 atomic(Value). 746 747 748 /******************************* 749 * MESSAGES * 750 *******************************/ 751:- multifile 752 prolog:message/3. 753 754% Catch messages. sgml/4 is generated by the SGML2PL binding. 755 756prologmessage(sgml(Parser, File, Line, Message)) --> 757 { get_sgml_parser(Parser, dialect(Dialect)) 758 }, 759 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 760 761 762 /******************************* 763 * XREF SUPPORT * 764 *******************************/ 765 766:- multifile 767 prolog:called_by/2. 768 769prologcalled_by(sgml_parse(_, Options), Called) :- 770 findall(Meta, meta_call_term(_, Meta, Options), Called). 771 772meta_call_term(T, G+N, Options) :- 773 T = call(Event, G), 774 pmember(T, Options), 775 call_params(Event, Term), 776 functor(Term, _, N). 777 778pmember(X, List) :- % member for partial lists 779 nonvar(List), 780 List = [H|T], 781 ( X = H 782 ; pmember(X, T) 783 ). 784 785call_params(begin, begin(tag,attributes,parser)). 786call_params(end, end(tag,parser)). 787call_params(cdata, cdata(cdata,parser)). 788call_params(pi, pi(cdata,parser)). 789call_params(decl, decl(cdata,parser)). 790call_params(error, error(severity,message,parser)). 791call_params(xmlns, xmlns(namespace,url,parser)). 792call_params(urlns, urlns(url,url,parser)). 793 794 /******************************* 795 * SANDBOX * 796 *******************************/ 797 798:- multifile 799 sandbox:safe_primitive/1, 800 sandbox:safe_meta_predicate/1. 801 802sandbox:safe_meta_predicate(sgml:load_structure/3). 803sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 804 dtd_alias(Dialect, _). 805sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 806sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 807sandbox:safe_primitive(sgml:xml_name(_,_)). 808sandbox:safe_primitive(sgml:xml_basechar(_)). 809sandbox:safe_primitive(sgml:xml_ideographic(_)). 810sandbox:safe_primitive(sgml:xml_combining_char(_)). 811sandbox:safe_primitive(sgml:xml_digit(_)). 812sandbox:safe_primitive(sgml:xml_extender(_)). 813sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 814sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 815sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by library(xpath) to extract information from the document.