View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2000-2025, University of Amsterdam
    7                              VU University Amsterdam
    8                              CWI, Amsterdam
    9                              SWI-Prolog Solutions b.v.
   10    All rights reserved.
   11
   12    Redistribution and use in source and binary forms, with or without
   13    modification, are permitted provided that the following conditions
   14    are met:
   15
   16    1. Redistributions of source code must retain the above copyright
   17       notice, this list of conditions and the following disclaimer.
   18
   19    2. Redistributions in binary form must reproduce the above copyright
   20       notice, this list of conditions and the following disclaimer in
   21       the documentation and/or other materials provided with the
   22       distribution.
   23
   24    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   25    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   26    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   27    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   28    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   29    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   30    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   31    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   32    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   34    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   35    POSSIBILITY OF SUCH DAMAGE.
   36*/
   37
   38:- module(sgml,
   39          [ load_html/3,                % +Input, -DOM, +Options
   40            load_xml/3,                 % +Input, -DOM, +Options
   41            load_sgml/3,                % +Input, -DOM, +Options
   42
   43            load_sgml_file/2,           % +File, -ListOfContent
   44            load_xml_file/2,            % +File, -ListOfContent
   45            load_html_file/2,           % +File, -Document
   46
   47            load_structure/3,           % +File, -Term, +Options
   48
   49            load_dtd/2,                 % +DTD, +File
   50            load_dtd/3,                 % +DTD, +File, +Options
   51            dtd/2,                      % +Type, -DTD
   52            dtd_property/2,             % +DTD, ?Property
   53
   54            new_dtd/2,                  % +Doctype, -DTD
   55            free_dtd/1,                 % +DTD
   56            open_dtd/3,                 % +DTD, +Options, -Stream
   57
   58            new_sgml_parser/2,          % -Parser, +Options
   59            free_sgml_parser/1,         % +Parser
   60            set_sgml_parser/2,          % +Parser, +Options
   61            get_sgml_parser/2,          % +Parser, +Options
   62            sgml_parse/2,               % +Parser, +Options
   63
   64            sgml_register_catalog_file/2, % +File, +StartOrEnd
   65
   66            xml_quote_attribute/3,      % +In, -Quoted, +Encoding
   67            xml_quote_cdata/3,          % +In, -Quoted, +Encoding
   68            xml_quote_attribute/2,      % +In, -Quoted
   69            xml_quote_cdata/2,          % +In, -Quoted
   70            xml_name/1,                 % +In
   71            xml_name/2,                 % +In, +Encoding
   72
   73            xsd_number_string/2,        % ?Number, ?String
   74            xsd_time_string/3,          % ?Term, ?Type, ?String
   75
   76            xml_basechar/1,             % +Code
   77            xml_ideographic/1,          % +Code
   78            xml_combining_char/1,       % +Code
   79            xml_digit/1,                % +Code
   80            xml_extender/1,             % +Code
   81
   82            iri_xml_namespace/2,        % +IRI, -Namespace
   83            iri_xml_namespace/3,        % +IRI, -Namespace, -LocalName
   84            xml_is_dom/1                % +Term
   85          ]).   86:- autoload(library(error),[instantiation_error/1]).   87:- autoload(library(iostream),[open_any/5,close_any/1]).   88:- autoload(library(lists),[member/2,selectchk/3]).   89:- autoload(library(option),[select_option/3,merge_options/3]).   90
   91:- meta_predicate
   92    load_structure(+, -, :),
   93    load_html(+, -, :),
   94    load_xml(+, -, :),
   95    load_sgml(+, -, :).   96
   97:- predicate_options(load_structure/3, 3,
   98                     [ charpos(integer),
   99                       cdata(oneof([atom,string])),
  100                       defaults(boolean),
  101                       dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
  102                       doctype(atom),
  103                       dtd(any),
  104                       encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
  105                       entity(atom,atom),
  106                       keep_prefix(boolean),
  107                       file(atom),
  108                       line(integer),
  109                       offset(integer),
  110                       number(oneof([token,integer])),
  111                       qualify_attributes(boolean),
  112                       shorttag(boolean),
  113                       case_sensitive_attributes(boolean),
  114                       case_preserving_attributes(boolean),
  115                       system_entities(boolean),
  116                       max_memory(integer),
  117                       ignore_doctype(boolean),
  118                       space(oneof([sgml,preserve,default,remove,strict])),
  119                       xmlns(atom),
  120                       xmlns(atom,atom),
  121                       pass_to(sgml_parse/2, 2)
  122                     ]).  123:- predicate_options(load_html/3, 3,
  124                     [ pass_to(load_structure/3, 3)
  125                     ]).  126:- predicate_options(load_xml/3, 3,
  127                     [ pass_to(load_structure/3, 3)
  128                     ]).  129:- predicate_options(load_sgml/3, 3,
  130                     [ pass_to(load_structure/3, 3)
  131                     ]).  132:- predicate_options(load_dtd/3, 3,
  133                     [ dialect(oneof([sgml,xml,xmlns])),
  134                       pass_to(open/4, 4)
  135                     ]).  136:- predicate_options(sgml_parse/2, 2,
  137                     [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
  138                            callable),
  139                       cdata(oneof([atom,string])),
  140                       content_length(integer),
  141                       document(-any),
  142                       max_errors(integer),
  143                       parse(oneof([file,element,content,declaration,input])),
  144                       source(any),
  145                       syntax_errors(oneof([quiet,print,style])),
  146                       xml_no_ns(oneof([error,quiet]))
  147                     ]).  148:- predicate_options(new_sgml_parser/2, 2,
  149                     [ dtd(any)
  150                     ]).  151
  152
  153/** <module> SGML, XML and HTML parser
  154
  155This library allows you to parse SGML, XML   and HTML data into a Prolog
  156data structure. The library defines several families of predicates:
  157
  158  $ High-level predicates :
  159  Most users will only use load_html/3, load_xml/3 or load_sgml/3 to
  160  parse arbitrary input into a _DOM_ structure.  These predicates all
  161  call load_structure/3, which provides more options and may be
  162  used for processing non-standard documents.
  163
  164  The DOM structure can be used by library(xpath) to extract information
  165  from the document.
  166
  167  $ The low-level parser :
  168  The actual parser is written in C and consists of two parts: one for
  169  processing DTD (Document Type Definitions) and one for parsing data.
  170  The data can either be parsed to a Prolog (_DOM_) term or the parser
  171  can perform callbacks for the DOM _events_.
  172
  173  $ Utility predicates :
  174  Finally, this library provides prmitives for classifying characters
  175  and strings according to the XML specification such as xml_name/1 to
  176  verify whether an atom is a valid XML name (identifier).  It also
  177  provides primitives to quote attributes and CDATA elements.
  178*/
  179
  180:- multifile user:file_search_path/2.  181:- dynamic   user:file_search_path/2.  182
  183user:file_search_path(dtd, library('DTD')).
  184
  185sgml_register_catalog_file(File, Location) :-
  186    prolog_to_os_filename(File, OsFile),
  187    '_sgml_register_catalog_file'(OsFile, Location).
  188
  189:- use_foreign_library(foreign(sgml2pl)).  190
  191register_catalog(Base) :-
  192    absolute_file_name(dtd(Base), SocFile,
  193                       [ extensions([soc]),
  194                         access(read),
  195                         file_errors(fail)
  196                       ]),
  197    sgml_register_catalog_file(SocFile, end).
  198
  199:- initialization
  200    ignore(register_catalog('HTML4')).  201
  202
  203                 /*******************************
  204                 *         DTD HANDLING         *
  205                 *******************************/
  206
  207/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  208Note that concurrent access to DTD objects  is not allowed, and hence we
  209will allocate and destroy them in each   thread.  Possibibly it would be
  210nicer to find out why  concurrent  access   to  DTD's  is  flawed. It is
  211diagnosed to mess with the entity resolution by Fabien Todescato.
  212- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  213
  214:- thread_local
  215    current_dtd/2.  216:- volatile
  217    current_dtd/2.  218:- thread_local
  219    registered_cleanup/0.  220:- volatile
  221    registered_cleanup/0.  222
  223:- multifile
  224    dtd_alias/2.  225
  226:- create_prolog_flag(html_dialect, html5, [type(atom)]).  227
  228dtd_alias(html4, 'HTML4').
  229dtd_alias(html5, 'HTML5').
  230dtd_alias(html,  DTD) :-
  231    current_prolog_flag(html_dialect, Dialect),
  232    dtd_alias(Dialect, DTD).
  233
  234%!  dtd(+Type, -DTD) is det.
  235%
  236%   DTD is a DTD object created from  the file dtd(Type). Loaded DTD
  237%   objects are cached. Note that  DTD   objects  may  not be shared
  238%   between threads. Therefore, dtd/2  maintains   the  pool  of DTD
  239%   objects  using  a  thread_local  predicate.    DTD  objects  are
  240%   destroyed if a thread terminates.
  241%
  242%   @error existence_error(source_sink, dtd(Type))
  243
  244dtd(Type, DTD) :-
  245    current_dtd(Type, DTD),
  246    !.
  247dtd(Type, DTD) :-
  248    new_dtd(Type, DTD),
  249    (   dtd_alias(Type, Base)
  250    ->  true
  251    ;   Base = Type
  252    ),
  253    absolute_file_name(dtd(Base),
  254                       [ extensions([dtd]),
  255                         access(read)
  256                       ], DtdFile),
  257    load_dtd(DTD, DtdFile),
  258    register_cleanup,
  259    asserta(current_dtd(Type, DTD)).
  260
  261%!  load_dtd(+DTD, +DtdFile, +Options)
  262%
  263%   Load DtdFile into a DTD.  Defined options are:
  264%
  265%           * dialect(+Dialect)
  266%           Dialect to use (xml, xmlns, sgml)
  267%
  268%           * encoding(+Encoding)
  269%           Encoding of DTD file
  270%
  271%   @param  DTD is a fresh DTD object, normally created using
  272%           new_dtd/1.
  273
  274load_dtd(DTD, DtdFile) :-
  275    load_dtd(DTD, DtdFile, []).
  276load_dtd(DTD, DtdFile, Options) :-
  277    sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
  278    setup_call_cleanup(
  279        open_dtd(DTD, DTDOptions, DtdOut),
  280        setup_call_cleanup(
  281            open(DtdFile, read, DtdIn, OpenOptions),
  282            copy_stream_data(DtdIn, DtdOut),
  283            close(DtdIn)),
  284        close(DtdOut)).
  285
  286%!  destroy_dtds
  287%
  288%   Destroy  DTDs  cached  by  this  thread   as  they  will  become
  289%   unreachable anyway.
  290
  291destroy_dtds :-
  292    (   current_dtd(_Type, DTD),
  293        free_dtd(DTD),
  294        fail
  295    ;   true
  296    ).
  297
  298%!  register_cleanup
  299%
  300%   Register cleanup of DTDs created for this thread.
  301
  302register_cleanup :-
  303    registered_cleanup,
  304    !.
  305register_cleanup :-
  306    (   thread_self(main)
  307    ->  at_halt(destroy_dtds)
  308    ;   current_prolog_flag(threads, true)
  309    ->  prolog_listen(this_thread_exit, destroy_dtds)
  310    ;   true
  311    ),
  312    assert(registered_cleanup).
  313
  314
  315                 /*******************************
  316                 *          EXAMINE DTD         *
  317                 *******************************/
  318
  319prop(doctype(_), _).
  320prop(elements(_), _).
  321prop(entities(_), _).
  322prop(notations(_), _).
  323prop(entity(E, _), DTD) :-
  324    (   nonvar(E)
  325    ->  true
  326    ;   '$dtd_property'(DTD, entities(EL)),
  327        member(E, EL)
  328    ).
  329prop(element(E, _, _), DTD) :-
  330    (   nonvar(E)
  331    ->  true
  332    ;   '$dtd_property'(DTD, elements(EL)),
  333        member(E, EL)
  334    ).
  335prop(attributes(E, _), DTD) :-
  336    (   nonvar(E)
  337    ->  true
  338    ;   '$dtd_property'(DTD, elements(EL)),
  339        member(E, EL)
  340    ).
  341prop(attribute(E, A, _, _), DTD) :-
  342    (   nonvar(E)
  343    ->  true
  344    ;   '$dtd_property'(DTD, elements(EL)),
  345        member(E, EL)
  346    ),
  347    (   nonvar(A)
  348    ->  true
  349    ;   '$dtd_property'(DTD, attributes(E, AL)),
  350        member(A, AL)
  351    ).
  352prop(notation(N, _), DTD) :-
  353    (   nonvar(N)
  354    ->  true
  355    ;   '$dtd_property'(DTD, notations(NL)),
  356        member(N, NL)
  357    ).
  358
  359dtd_property(DTD, Prop) :-
  360    prop(Prop, DTD),
  361    '$dtd_property'(DTD, Prop).
  362
  363
  364                 /*******************************
  365                 *             SGML             *
  366                 *******************************/
  367
  368%!  load_structure(+Source, -ListOfContent, :Options) is det.
  369%
  370%   Parse   Source   and   return   the   resulting   structure   in
  371%   ListOfContent. Source is handed to  open_any/5, which allows for
  372%   processing an extensible set of input sources.
  373%
  374%   A proper XML document contains only   a  single toplevel element
  375%   whose name matches the document type.   Nevertheless,  a list is
  376%   returned for consistency with  the   representation  of  element
  377%   content.
  378%
  379%   The  encoding(+Encoding)  option   is    treated   special   for
  380%   compatibility reasons:
  381%
  382%     - If `Encoding` is one of =iso-8859-1=, =us-ascii= or =utf-8=,
  383%       the stream is opened in binary mode and the option is passed
  384%       to the SGML parser.
  385%     - If `Encoding` is present, but not one of the above, the
  386%       stream is opened in text mode using the given encoding.
  387%     - Otherwise (no `Encoding`), the stream is opened in binary
  388%       mode and doing the correct decoding is left to the parser.
  389
  390load_structure(Spec, DOM, Options) :-
  391    sgml_open_options(Options, OpenOptions, SGMLOptions),
  392    setup_call_cleanup(
  393        open_any(Spec, read, In, Close, OpenOptions),
  394        load_structure_from_stream(In, DOM, SGMLOptions),
  395        close_any(Close)).
  396
  397sgml_open_options(Options, OpenOptions, SGMLOptions) :-
  398    Options = M:Plain,
  399    (   select_option(encoding(Encoding), Plain, NoEnc)
  400    ->  (   sgml_encoding(Encoding)
  401        ->  merge_options(NoEnc, [type(binary)], OpenOptions),
  402            SGMLOptions = Options
  403        ;   OpenOptions = Plain,
  404            SGMLOptions = M:NoEnc
  405        )
  406    ;   merge_options(Plain, [type(binary)], OpenOptions),
  407        SGMLOptions = Options
  408    ).
  409
  410sgml_encoding(Enc) :-
  411    downcase_atom(Enc, Enc1),
  412    sgml_encoding_l(Enc1).
  413
  414sgml_encoding_l('iso-8859-1').
  415sgml_encoding_l('us-ascii').
  416sgml_encoding_l('utf-8').
  417sgml_encoding_l('utf8').
  418sgml_encoding_l('iso_latin_1').
  419sgml_encoding_l('ascii').
  420
  421load_structure_from_stream(In, Term, M:Options) :-
  422    (   select_option(dtd(DTD), Options, Options1)
  423    ->  ExplicitDTD = true
  424    ;   ExplicitDTD = false,
  425        Options1 = Options
  426    ),
  427    move_front(Options1, dialect(_), Options2), % dialect sets defaults
  428    setup_call_cleanup(
  429        new_sgml_parser(Parser,
  430                        [ dtd(DTD)
  431                        ]),
  432        parse(Parser, M:Options2, TermRead, In),
  433        free_sgml_parser(Parser)),
  434    (   ExplicitDTD == true
  435    ->  (   DTD = dtd(_, DocType),
  436            dtd_property(DTD, doctype(DocType))
  437        ->  true
  438        ;   true
  439        )
  440    ;   free_dtd(DTD)
  441    ),
  442    Term = TermRead.
  443
  444move_front(Options0, Opt, Options) :-
  445    selectchk(Opt, Options0, Options1),
  446    !,
  447    Options = [Opt|Options1].
  448move_front(Options, _, Options).
  449
  450
  451parse(Parser, M:Options, Document, In) :-
  452    set_parser_options(Options, Parser, In, Options1),
  453    parser_meta_options(Options1, M, Options2),
  454    set_input_location(Parser, In),
  455    sgml_parse(Parser,
  456               [ document(Document),
  457                 source(In)
  458               | Options2
  459               ]).
  460
  461set_parser_options([], _, _, []).
  462set_parser_options([H|T], Parser, In, Rest) :-
  463    (   set_parser_option(H, Parser, In)
  464    ->  set_parser_options(T, Parser, In, Rest)
  465    ;   Rest = [H|R2],
  466        set_parser_options(T, Parser, In, R2)
  467    ).
  468
  469set_parser_option(Var, _Parser, _In) :-
  470    var(Var),
  471    !,
  472    instantiation_error(Var).
  473set_parser_option(Option, Parser, _) :-
  474    def_entity(Option, Parser),
  475    !.
  476set_parser_option(offset(Offset), _Parser, In) :-
  477    !,
  478    seek(In, Offset, bof, _).
  479set_parser_option(Option, Parser, _In) :-
  480    parser_option(Option),
  481    !,
  482    set_sgml_parser(Parser, Option).
  483set_parser_option(Name=Value, Parser, In) :-
  484    Option =.. [Name,Value],
  485    set_parser_option(Option, Parser, In).
  486
  487
  488parser_option(dialect(_)).
  489parser_option(shorttag(_)).
  490parser_option(case_sensitive_attributes(_)).
  491parser_option(case_preserving_attributes(_)).
  492parser_option(system_entities(_)).
  493parser_option(max_memory(_)).
  494parser_option(ignore_doctype(_)).
  495parser_option(file(_)).
  496parser_option(line(_)).
  497parser_option(space(_)).
  498parser_option(number(_)).
  499parser_option(defaults(_)).
  500parser_option(doctype(_)).
  501parser_option(qualify_attributes(_)).
  502parser_option(encoding(_)).
  503parser_option(keep_prefix(_)).
  504
  505
  506def_entity(entity(Name, Value), Parser) :-
  507    get_sgml_parser(Parser, dtd(DTD)),
  508    xml_quote_attribute(Value, QValue),
  509    setup_call_cleanup(open_dtd(DTD, [], Stream),
  510                       format(Stream, '<!ENTITY ~w "~w">~n',
  511                              [Name, QValue]),
  512                       close(Stream)).
  513def_entity(xmlns(URI), Parser) :-
  514    set_sgml_parser(Parser, xmlns(URI)).
  515def_entity(xmlns(NS, URI), Parser) :-
  516    set_sgml_parser(Parser, xmlns(NS, URI)).
  517
  518%!  parser_meta_options(+Options0, +Module, -Options)
  519%
  520%   Qualify meta-calling options to the parser.
  521
  522parser_meta_options([], _, []).
  523parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
  524    !,
  525    parser_meta_options(T0, M, T).
  526parser_meta_options([H|T0], M, [H|T]) :-
  527    parser_meta_options(T0, M, T).
  528
  529
  530%!  set_input_location(+Parser, +In:stream) is det.
  531%
  532%   Set the input location if this was not set explicitly
  533
  534set_input_location(Parser, _In) :-
  535    get_sgml_parser(Parser, file(_)),
  536    !.
  537set_input_location(Parser, In) :-
  538    stream_property(In, file_name(File)),
  539    !,
  540    set_sgml_parser(Parser, file(File)),
  541    stream_property(In, position(Pos)),
  542    set_sgml_parser(Parser, position(Pos)).
  543set_input_location(_, _).
  544
  545                 /*******************************
  546                 *           UTILITIES          *
  547                 *******************************/
  548
  549%!  load_sgml_file(+File, -DOM) is det.
  550%
  551%   Load SGML from File and unify   the resulting DOM structure with
  552%   DOM.
  553%
  554%   @deprecated     New code should use load_sgml/3.
  555
  556load_sgml_file(File, Term) :-
  557    load_sgml(File, Term, []).
  558
  559%!  load_xml_file(+File, -DOM) is det.
  560%
  561%   Load XML from File and unify   the  resulting DOM structure with
  562%   DOM.
  563%
  564%   @deprecated     New code should use load_xml/3.
  565
  566load_xml_file(File, Term) :-
  567    load_xml(File, Term, []).
  568
  569%!  load_html_file(+File, -DOM) is det.
  570%
  571%   Load HTML from File and unify   the resulting DOM structure with
  572%   DOM.
  573%
  574%   @deprecated     New code should use load_html/3.
  575
  576load_html_file(File, DOM) :-
  577    load_html(File, DOM, []).
  578
  579%!  load_html(+Input, -DOM, +Options) is det.
  580%
  581%   Load HTML text from Input and  unify the resulting DOM structure
  582%   with DOM. Options are passed   to load_structure/3, after adding
  583%   the following default options:
  584%
  585%     - dtd(DTD)
  586%     Pass the DTD for HTML as obtained using dtd(html, DTD).
  587%     - dialect(Dialect)
  588%     Current dialect from the Prolog flag =html_dialect=
  589%     - max_errors(-1)
  590%     - syntax_errors(quiet)
  591%     Most HTML encountered in the wild contains errors. Even in the
  592%     context of errors, the resulting DOM term is often a
  593%     reasonable guess at the intent of the author.
  594%
  595%   You may also want to use  the library(http/http_open) to support
  596%   loading from HTTP and HTTPS URLs. For example:
  597%
  598%   ==
  599%   :- use_module(library(http/http_open)).
  600%   :- use_module(library(sgml)).
  601%
  602%   load_html_url(URL, DOM) :-
  603%       load_html(URL, DOM, []).
  604%   ==
  605
  606load_html(File, Term, M:Options) :-
  607    current_prolog_flag(html_dialect, Dialect),
  608    dtd(Dialect, DTD),
  609    merge_options(Options,
  610                  [ dtd(DTD),
  611                    dialect(Dialect),
  612                    max_errors(-1),
  613                    syntax_errors(quiet)
  614                  ], Options1),
  615    load_structure(File, Term, M:Options1).
  616
  617%!  load_xml(+Input, -DOM, +Options) is det.
  618%
  619%   Load XML text from Input and   unify the resulting DOM structure
  620%   with DOM. Options are passed   to load_structure/3, after adding
  621%   the following default options:
  622%
  623%     - dialect(xml)
  624
  625load_xml(Input, DOM, M:Options) :-
  626    merge_options(Options,
  627                  [ dialect(xml)
  628                  ], Options1),
  629    load_structure(Input, DOM, M:Options1).
  630
  631%!  load_sgml(+Input, -DOM, +Options) is det.
  632%
  633%   Load SGML text from Input and  unify the resulting DOM structure
  634%   with DOM. Options are passed   to load_structure/3, after adding
  635%   the following default options:
  636%
  637%     - dialect(sgml)
  638
  639load_sgml(Input, DOM, M:Options) :-
  640    merge_options(Options,
  641                  [ dialect(sgml)
  642                  ], Options1),
  643    load_structure(Input, DOM, M:Options1).
  644
  645
  646
  647                 /*******************************
  648                 *            ENCODING          *
  649                 *******************************/
  650
  651%!  xml_quote_attribute(+In, -Quoted) is det.
  652%!  xml_quote_cdata(+In, -Quoted) is det.
  653%
  654%   Backward  compatibility  for  versions  that  allow  to  specify
  655%   encoding. All characters that cannot fit the encoding are mapped
  656%   to XML character entities (&#dd;).  Using   ASCII  is the safest
  657%   value.
  658
  659xml_quote_attribute(In, Quoted) :-
  660    xml_quote_attribute(In, Quoted, ascii).
  661
  662xml_quote_cdata(In, Quoted) :-
  663    xml_quote_cdata(In, Quoted, ascii).
  664
  665%!  xml_name(+Atom) is semidet.
  666%
  667%   True if Atom is a valid XML name.
  668
  669xml_name(In) :-
  670    xml_name(In, ascii).
  671
  672
  673                 /*******************************
  674                 *    XML CHARACTER CLASSES     *
  675                 *******************************/
  676
  677%!  xml_basechar(+CodeOrChar) is semidet.
  678%!  xml_ideographic(+CodeOrChar) is semidet.
  679%!  xml_combining_char(+CodeOrChar) is semidet.
  680%!  xml_digit(+CodeOrChar) is semidet.
  681%!  xml_extender(+CodeOrChar) is semidet.
  682%
  683%   XML  character  classification   predicates.    Each   of  these
  684%   predicates accept both a character   (one-character  atom) and a
  685%   code (integer).
  686%
  687%   @see http://www.w3.org/TR/2006/REC-xml-20060816
  688
  689
  690                 /*******************************
  691                 *         TYPE CHECKING        *
  692                 *******************************/
  693
  694%!  xml_is_dom(@Term) is semidet.
  695%
  696%   True  if  term  statisfies   the    structure   as  returned  by
  697%   load_structure/3 and friends.
  698
  699xml_is_dom(0) :- !, fail.               % catch variables
  700xml_is_dom(List) :-
  701    is_list(List),
  702    !,
  703    xml_is_content_list(List).
  704xml_is_dom(Term) :-
  705    xml_is_element(Term).
  706
  707xml_is_content_list([]).
  708xml_is_content_list([H|T]) :-
  709    xml_is_content(H),
  710    xml_is_content_list(T).
  711
  712xml_is_content(0) :- !, fail.
  713xml_is_content(pi(Pi)) :-
  714    !,
  715    atom(Pi).
  716xml_is_content(CDATA) :-
  717    atom(CDATA),
  718    !.
  719xml_is_content(CDATA) :-
  720    string(CDATA),
  721    !.
  722xml_is_content(Term) :-
  723    xml_is_element(Term).
  724
  725xml_is_element(element(Name, Attributes, Content)) :-
  726    dom_name(Name),
  727    dom_attributes(Attributes),
  728    xml_is_content_list(Content).
  729
  730dom_name(NS:Local) :-
  731    atom(NS),
  732    atom(Local),
  733    !.
  734dom_name(Local) :-
  735    atom(Local).
  736
  737dom_attributes(0) :- !, fail.
  738dom_attributes([]).
  739dom_attributes([H|T]) :-
  740    dom_attribute(H),
  741    dom_attributes(T).
  742
  743dom_attribute(Name=Value) :-
  744    dom_name(Name),
  745    atomic(Value).
  746
  747
  748                 /*******************************
  749                 *            MESSAGES          *
  750                 *******************************/
  751:- multifile
  752    prolog:message/3.  753
  754%       Catch messages.  sgml/4 is generated by the SGML2PL binding.
  755
  756prolog:message(sgml(Parser, File, Line, Message)) -->
  757    { get_sgml_parser(Parser, dialect(Dialect))
  758    },
  759    [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
  760
  761
  762                 /*******************************
  763                 *         XREF SUPPORT         *
  764                 *******************************/
  765
  766:- multifile
  767    prolog:called_by/2.  768
  769prolog:called_by(sgml_parse(_, Options), Called) :-
  770    findall(Meta, meta_call_term(_, Meta, Options), Called).
  771
  772meta_call_term(T, G+N, Options) :-
  773    T = call(Event, G),
  774    pmember(T, Options),
  775    call_params(Event, Term),
  776    functor(Term, _, N).
  777
  778pmember(X, List) :-                     % member for partial lists
  779    nonvar(List),
  780    List = [H|T],
  781    (   X = H
  782    ;   pmember(X, T)
  783    ).
  784
  785call_params(begin, begin(tag,attributes,parser)).
  786call_params(end,   end(tag,parser)).
  787call_params(cdata, cdata(cdata,parser)).
  788call_params(pi,    pi(cdata,parser)).
  789call_params(decl,  decl(cdata,parser)).
  790call_params(error, error(severity,message,parser)).
  791call_params(xmlns, xmlns(namespace,url,parser)).
  792call_params(urlns, urlns(url,url,parser)).
  793
  794                 /*******************************
  795                 *           SANDBOX            *
  796                 *******************************/
  797
  798:- multifile
  799    sandbox:safe_primitive/1,
  800    sandbox:safe_meta_predicate/1.  801
  802sandbox:safe_meta_predicate(sgml:load_structure/3).
  803sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
  804    dtd_alias(Dialect, _).
  805sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
  806sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
  807sandbox:safe_primitive(sgml:xml_name(_,_)).
  808sandbox:safe_primitive(sgml:xml_basechar(_)).
  809sandbox:safe_primitive(sgml:xml_ideographic(_)).
  810sandbox:safe_primitive(sgml:xml_combining_char(_)).
  811sandbox:safe_primitive(sgml:xml_digit(_)).
  812sandbox:safe_primitive(sgml:xml_extender(_)).
  813sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
  814sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
  815sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))