View source with formatted comments or as raw
    1/*  Part of SWISH
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@cs.vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 2017, VU University Amsterdam
    7			 CWI Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(swish_data_source,
   37          [ data_source/2,              % :Id, +Source
   38            data_record/2,              % :Id, -Record
   39            record/2,                   % :Id, -Record
   40            data_property/2,            % :Id, ?Property
   41            data_row/2,                 % :Id, -Row
   42            data_row/4,                 % :Id, +Range, +Header, -Row
   43            data_dump/3,                % :Id, +Range, -Row
   44
   45            data_flush/1,               % +Hash
   46            'data assert'/1,            % +Term
   47            'data materialized'/3,	% +Hash, +Signature, +SourceID
   48            'data failed'/2		% +Hash, +Signature
   49          ]).   50:- use_module(library(error)).   51:- use_module(library(lists)).   52:- use_module(library(settings)).   53:- use_module(library(solution_sequences)).   54:- use_module(library(pengines)).   55
   56:- setting(max_memory, integer, 8000,
   57           "Max memory used for cached data store (Mb)").   58
   59
   60/** <module> Cached data access
   61
   62This module provides access to external data   by caching it as a Prolog
   63predicate. The data itself is kept in  a   global  data module, so it is
   64maintained over a SWISH Pengine invocation.
   65*/
   66
   67:- meta_predicate
   68    data_source(:, +),
   69    data_record(:, -),
   70    record(:, -),
   71    data_row(:, -),
   72    data_row(:, +, +, -),
   73    data_dump(:, +, -),
   74    data_property(:, -).   75
   76:- multifile
   77    source/2.                           % +Term, -Goal
   78
   79
   80		 /*******************************
   81		 *          ADMIN DATA		*
   82		 *******************************/
   83
   84:- dynamic
   85    data_source_db/3,                   % Hash, Goal, Lock
   86    data_signature_db/2,                % Hash, Signature
   87    data_materialized/5,                % Hash, Materialized, SourceID, CPU, Wall
   88    data_last_access/3.                 % Hash, Time, Updates
   89
   90'data assert'(Term) :-
   91    assertz(Term).
   92
   93%!  'data materialized'(+Hash, +Signature, +SourceVersion) is det.
   94%
   95%   Called by a data plugin  to  indicate   that  loading  the  data has
   96%   finished.
   97%
   98%   @arg Hash is the has of the original data source
   99%   @arg Signature is a term Hash(Arg1, Arg2, ...), where `Arg1`, ...
  100%   are atoms or small integers that indicate the field names.
  101%   @arg SourceVersion is a term that indicates the identity of the source.
  102%   this is typically a dict containing e.g., a time stamp, content
  103%   hash, HTTP =Etag= value, etc.
  104
  105'data materialized'(Hash, Signature, SourceVersion) :-
  106    statistics(cputime, CPU1),
  107    get_time(Now),
  108    nb_current('$data_source_materalize', stats(Time0, CPU0)),
  109    CPU  is CPU1 - CPU0,
  110    Wall is Now - Time0,
  111    assertz(data_signature_db(Hash, Signature)),
  112    assertz(data_materialized(Hash, Now, SourceVersion, CPU, Wall)).
  113
  114'data failed'(_Hash, Signature) :-
  115    functor(Signature, Name, Arity),
  116    functor(Generic, Name, Arity),
  117    retractall(Generic).
  118
  119%!  data_source(:Id, +Source) is det.
  120%
  121%   Create a data source Id from   the  source definition Source. Source
  122%   definitions are plugin files loaded from swish(data).
  123
  124data_source(M:Id, Source) :-
  125    variant_sha1(Source, Hash),
  126    data_source_db(Hash, Source, _),
  127    !,
  128    (   clause(M:'$data'(Id, Hash), true)
  129    ->  true
  130    ;   assertz(M:'$data'(Id, Hash))
  131    ).
  132data_source(M:Id, Source) :-
  133    valid_source(Source),
  134    variant_sha1(Source, Hash),
  135    mutex_create(Lock),
  136    assertz(data_source_db(Hash, Source, Lock)),
  137    assertz(M:'$data'(Id, Hash)).
  138
  139%!  record(:Id, -Record) is nondet.
  140%!  data_record(:Id, -Record) is nondet.
  141%
  142%   True when Record is  a  dict  representing   a  row  in  the dataset
  143%   identified by Id.
  144%
  145%   @deprecated  record/2  is   deprecated.   New    code   should   use
  146%   data_record/2.
  147
  148record(Id, Record) :-
  149    data_record(Id, Record).
  150
  151data_record(M:Id, Record) :-
  152    data_hash(M:Id, Hash),
  153    materialize(Hash),
  154    data_signature_db(Hash, Signature),
  155    data_record(Signature, Id, Record, Head),
  156    call(Head).
  157
  158data_record(Signature, Tag, Record, Head) :-
  159    Signature =.. [Name|Keys],
  160    pairs_keys_values(Pairs, Keys, Values),
  161    dict_pairs(Record, Tag, Pairs),
  162    Head =.. [Name|Values].
  163
  164data_hash(M:Id, Hash) :-
  165    clause(M:'$data'(Id, Hash), true),
  166    !.
  167data_hash(_:Id, _) :-
  168    existence_error(dataset, Id).
  169
  170%!  data_row(:Id, -Row) is nondet.
  171%!  data_row(:Id, +Range, +Header, -Row) is nondet.
  172%
  173%   True when Row is a term Id(Arg,   ...), where the first row contains
  174%   the column names.
  175%
  176%   @arg Header If `true`, include a header row.
  177%   @see data_dump/3 to return a table and for a description of Range.
  178
  179data_row(Id, Row) :-
  180    data_row(Id, all, true, Row).
  181
  182data_row(M:Id, Range, Header, Row) :-
  183    must_be(boolean, Header),
  184    data_hash(M:Id, Hash),
  185    materialize(Hash),
  186    data_signature_db(Hash, Signature),
  187    Signature =.. [_|ColNames],
  188    same_length(ColNames, Vars),
  189    Goal =.. [Hash|Vars],
  190    Row  =.. [Id|Vars],
  191    (   Header == true,
  192        Vars = ColNames
  193    ;   range(Range, M:Id, Goal)
  194    ).
  195
  196range(all, _Id, Goal) :-
  197    !,
  198    call(Goal).
  199range(From-To, _Id, Goal) :-
  200    !,
  201    Skip is From - 1,
  202    Size is To-Skip,
  203    limit(Size, offset(Skip, call(Goal))).
  204range(Limit, _Id, Goal) :-
  205    Limit >= 0,
  206    !,
  207    limit(Limit, call(Goal)).
  208range(Limit, Id, Goal) :-
  209    Limit < 0,
  210    data_property(Id, rows(Rows)),
  211    Skip is Rows+Limit,
  212    offset(Skip, call(Goal)).
  213
  214%!  data_dump(:Id, +Range, -Table) is det.
  215%
  216%   Table is a list of rows in the indicated range. This cooperates with
  217%   the table rendering to produce a data table.  Range is one of:
  218%
  219%     - all
  220%       All rows from the data are included.  Be careful if these
  221%       are many as it is likely to make your browser very slow.
  222%     - From-To
  223%       List the (1-based) rows From to To
  224%     - Count
  225%       If Count >= 0, list the _first_, else list the _last_
  226%       Count rows.
  227
  228data_dump(Id, Range, Table) :-
  229    findall(Row, data_row(Id, Range, true, Row), Table).
  230
  231
  232%!  data_property(:Id, ?Property) is nondet.
  233%
  234%   True when Property is a known property about the data source Id.
  235%   Defined properties are:
  236%
  237%     - columns(-Count)
  238%       Number of columns in the table.
  239%     - column_names(-Names)
  240%       Names is a list of the column names as they appear in the
  241%       data.
  242%     - rows(-Rows)
  243%       Number of rows in the table
  244%     - hash(-Hash)
  245%       Get the internal (hashed) identifier for the data source
  246%     - source_version(-SourceVersion)
  247%       A term (often a dict) that provides version information
  248%       about the source.  Details depend on the source.
  249%     - materialized(-TimeStamp)
  250%       The data source was materialized at TimeStamp.
  251%     - source(-Term)
  252%       Description of the original source term used to declare
  253%       the data source
  254
  255data_property(M:Id, Property) :-
  256    data_hash(M:Id, Hash),
  257    materialize(Hash),
  258    property(Property),
  259    property(Property, Hash).
  260
  261property(columns(_)).
  262property(column_names(_)).
  263property(rows(_)).
  264property(hash(_)).
  265property(source_version(_)).
  266property(materialized(_)).
  267property(source(_)).
  268
  269property(columns(Count), Hash) :-
  270    data_signature_db(Hash, Signature),
  271    functor(Signature, _, Count).
  272property(column_names(Names), Hash) :-
  273    data_signature_db(Hash, Signature),
  274    Signature =.. [_|Names].
  275property(rows(Count), Hash) :-
  276    data_signature_db(Hash, Signature),
  277    predicate_property(Signature, number_of_clauses(Count)).
  278property(hash(Hash), Hash).
  279property(source_version(SourceVersion), Hash) :-
  280    data_materialized(Hash, _, SourceVersion, _, _).
  281property(materialized(TimeStamp), Hash) :-
  282    data_materialized(Hash, TimeStamp, _, _, _).
  283property(source(SourceTerm), Hash) :-
  284    data_source_db(Hash, SourceTerm, _Lock).
  285
  286%!  swish:goal_expansion(+Dict, -DataGoal)
  287%
  288%   Translate a Dict where the tag is   the  identifier of a data source
   289%   and the keys are columns of this  source   into  a goal on the data.
  290%   Note that the data itself  is   represented  as  a Prolog predicate,
  291%   representing each row as a fact and each column as an argument.
  292
  293:- multifile
  294    swish:goal_expansion/2.  295
  296swish:goal_expansion(Dict, swish_data_source:Head) :-
  297    is_dict(Dict, Id),
  298    prolog_load_context(module, M),
  299    clause(M:'$data'(Id, Hash), true),
  300    materialize(Hash),
  301    data_signature_db(Hash, Signature),
  302    data_record(Signature, Id, Record, Head),
  303    Dict :< Record.
  304
  305
  306		 /*******************************
  307		 *       DATA MANAGEMENT	*
  308		 *******************************/
  309
  310valid_source(Source) :-
  311    must_be(nonvar, Source),
  312    source(Source, _Goal),
  313    !.
  314valid_source(Source) :-
  315    existence_error(data_source, Source).
  316
  317%!  materialize(+Hash)
  318%
  319%   Materialise the data identified by   Hash.  The materialization goal
  320%   should
  321%
  322%     - Call 'data assert'/1 using a term Hash(Arg, ...) for each term
  323%       to add to the database.
  324%     - Call 'data materialized'(Hash, Signature, SourceVersion) on
  325%       completion, where `Signature` is a term Hash(ArgName, ...) and
  326%       `SourceVersion` indicates the version info provided by the
  327%       source.  Use `-` if this information is not available.
  328%     - OR call `data failed`(+Hash, +Signature) if materialization
  329%       fails after some data has been asserted.
  330
  331materialize(Hash) :-
  332    must_be(atom, Hash),
  333    data_materialized(Hash, _When, _From, _CPU, _Wall),
  334    !,
  335    update_last_access(Hash).
  336materialize(Hash) :-
  337    data_source_db(Hash, Source, Lock),
  338    update_last_access(Hash),
  339    gc_data,
  340    with_mutex(Lock, materialize_sync(Hash, Source)).
  341
  342materialize_sync(Hash, _Source) :-
  343    data_materialized(Hash, _When, _From, _CPU, _Wall),
  344    !.
  345materialize_sync(Hash, Source) :-
  346    source(Source, Goal),
  347    get_time(Time0),
  348    statistics(cputime, CPU0),
  349    setup_call_cleanup(
  350        b_setval('$data_source_materalize', stats(Time0, CPU0)),
  351        call(Goal, Hash),
  352        nb_delete('$data_source_materalize')),
  353    data_signature_db(Hash, Head),
  354    functor(Head, Name, Arity),
  355    public(Name/Arity).
  356
  357
  358		 /*******************************
  359		 *              GC		*
  360		 *******************************/
  361
  362%!  update_last_access(+Hash) is det.
  363%
  364%   Update the last known access time. The   value  is rounded down to 1
  365%   minute to reduce database updates.
  366
  367update_last_access(Hash) :-
  368    get_time(Now),
  369    Rounded is floor(Now/60)*60,
  370    (   data_last_access(Hash, Rounded, _)
  371    ->  true
  372    ;   clause(data_last_access(Hash, _, C0), true, Old)
  373    ->  C is C0+1,
  374        asserta(data_last_access(Hash, Rounded, C)),
  375        erase(Old)
  376    ;   asserta(data_last_access(Hash, Rounded, 1))
  377    ).
  378
  379gc_stats(Hash, _{ hash:Hash,
  380                  materialized:When, cpu:CPU, wall:Wall,
  381                  bytes:Size,
  382                  last_accessed_ago:Ago,
  383                  access_frequency:AccessCount
  384                }) :-
  385    data_materialized(Hash, When, _From, CPU, Wall),
  386    data_signature_db(Hash, Signature),
  387    data_last_access(Hash, Last, AccessCount),
  388    get_time(Now),
  389    Ago is floor(Now/60)*60-Last,
  390    predicate_property(Signature, number_of_clauses(Count)),
  391    functor(Signature, _, Arity),
  392    Size is (88+(16*Arity))*Count.
  393
  394
  395%!  gc_data is det.
  396%!  gc_data(+MaxSize) is det.
  397%
  398%   Remove the last unused data set until   memory  of this module drops
  399%   below  MaxSize.  The   predicate   gc_data/0    is   called   before
  400%   materializing a data source.
  401
  402gc_data :-
  403    setting(max_memory, MB),
  404    Bytes is MB*1024*1024,
  405    gc_data(Bytes),
  406    set_module(program_space(Bytes)).
  407
  408gc_data(MaxSize) :-
  409    module_property(swish_data_source, program_size(Size)),
  410    Size < MaxSize,
  411    !.
  412gc_data(MaxSize) :-
  413    findall(Stat, gc_stats(_, Stat), Stats),
  414    sort(last_accessed_ago, >=, Stats, ByTime),
  415    member(Stat, ByTime),
  416       data_flush(ByTime.hash),
  417       module_property(swish_data_source, program_size(Size)),
  418       Size < MaxSize,
  419    !.
  420gc_data(_).
  421
  422
  423%!  data_flush(+Hash)
  424%
  425%   Drop the data associated with hash
  426
  427data_flush(Hash) :-
  428    data_signature_db(Hash, Signature),
  429    data_record(Signature, _Id, _Record, Head),
  430    retractall(Head),
  431    retractall(data_signature_db(Hash, Head)),
  432    retractall(data_materialized(Hash, _When1, _From, _CPU, _Wall)),
  433    retractall(data_last_access(Hash, _When2, _Count)).
  434
  435
  436		 /*******************************
  437		 *            SANDBOX		*
  438		 *******************************/
  439
  440:- multifile
  441    sandbox:safe_meta/2.  442
  443sandbox:safe_meta(swish_data_source:data_source(Id,_), [])     :- safe_id(Id).
  444sandbox:safe_meta(swish_data_source:data_record(Id,_), [])     :- safe_id(Id).
  445sandbox:safe_meta(swish_data_source:record(Id,_), [])          :- safe_id(Id).
  446sandbox:safe_meta(swish_data_source:data_row(Id,_), [])        :- safe_id(Id).
  447sandbox:safe_meta(swish_data_source:data_row(Id,_,_,_), [])    :- safe_id(Id).
  448sandbox:safe_meta(swish_data_source:data_dump(Id,_,_), [])     :- safe_id(Id).
  449sandbox:safe_meta(swish_data_source:data_property(Id,_), [])   :- safe_id(Id).
  450
  451safe_id(M:_) :- !, pengine_self(M).
  452safe_id(_)