Newer
Older
/*
This is part of datacrate, a node.js library for working with
DataCrates. Copyright (C) 2018 University of Technology Sydney
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const path = require('path');
const fs = require('fs-extra');
const _ = require('lodash');
const DEFAULTS = path.join(__dirname, "../defaults");
const CONTEXT = path.join(DEFAULTS, 'context.json');
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// Suffixes of the datapub's 'related_*' fields which are crosswalked
// to citations.
const RELATED = [ 'publications', 'websites', 'metadata', 'data' ];

// The schema.org @type given to the items of each related_* field.
// JSON-LD terms are case-sensitive and the schema.org type is spelled
// 'Dataset'.
const RELATED_TYPES = {
  'publications': 'ScholarlyArticle',
  'websites': 'WebSite',
  'metadata': 'Dataset', // schema.org hasn't got a dedicated type
  'data': 'Dataset'
};

// datapub fields holding funding bodies and grants
const FUNDING_FIELDS = [ 'foaf:fundedBy_foaf:Agent', 'foaf:fundedBy_vivo:Grant' ];

// datapub fields holding ANZSRC field-of-research and socio-economic
// objective subjects
const SUBJECT_FIELDS = [ 'dc:subject_anzsrc:for', 'dc:subject_anzsrc:seo' ];

// Prefixes used to build a blank-node @id for each subject; these will
// do for now
const SUBJECT_IRI_PREFIX = {
  'dc:subject_anzsrc:for': '_:FOR/',
  'dc:subject_anzsrc:seo': '_:SEO/'
};
// dataset -> about -> subjects
// dataset -> spatialCoverage -> place
// dataset -> temporalCoverage -> time
// dataset -> funder -> organisation
// dataset -> license -> licence (CreativeWork or URL)
// dataset -> citation -> related publications, websites, articles, datasets
// The keys here are schema.org properties on the root Dataset,
// and the values are functions which take the data publication
// as an argument and return null or an array of property values.
// Maps a schema.org property of the root dataset to the function which
// builds that property's items from the data publication. The keys are
// written into the JSON-LD output, so they use the schema.org spellings
// ('temporalCoverage', 'license').
const DATASET_PROPERTIES = {
  'about': make_subjects,
  'spatialCoverage': make_spatial,
  'temporalCoverage': make_temporal,
  'funder': make_funders,
  'license': make_licence,
  'citation': make_related_works
};
/* datapub2catalog(options)

   Convert metadata from a redbox data publication and
   its data record to a JSON-LD datacrate catalogue

   options = {
     id: redbox oid,
     datapub: redbox data publication record,
     organisation: {         // 'org' is accepted as an alias
       @id: organisation identifier ('id' is accepted as an alias),
       name: organisation name
     },
     owner: email of the data pub's record,
     approver: email of the librarian who approved publication,
     dates: {
       dateCreated: Date object or string,
       datePublished: Date object or string
     }
   }

   If there are no dates, or either dateCreated or datePublished is
   undefined, now() is used as the default.

   The owner and approver are used to build two Actions against
   the object - creation and publication.

   Returns a promise of { '@context': ..., '@graph': [ items ] }.
   Throws an Error if the organisation has no id or no name.

   TODO: add contentSize, encodingFormat and fileFormat from
   siegfried to the dataLocations array

   Remember to keep the connection to the data record
*/
async function datapub2catalog(options) {
  const id = options['id'];
  const datapub = options['datapub'];
  // The header comment has always documented 'org' while the code read
  // 'organisation', so both spellings are honoured.
  const org = options['organisation'] || options['org'];
  const owner = options['owner'];
  const approver = options['approver'];
  const dates = dates_default(options['dates']);

  // Validate before doing any I/O
  const org_id = org ? ( org['@id'] || org['id'] ) : undefined;
  if( ! ( org_id && org['name'] ) ) {
    throw Error("Organization must have an id and a name");
  }

  const context = await fs.readJson(CONTEXT);

  const organisation = {
    '@id': org_id,
    '@type': 'Organization',
    'name': org['name']
  };

  // assumes that all creators are affiliated to this one organisation
  var people = make_creators(datapub, organisation);
  const files = make_files(datapub);
  const dataset = make_dataset(id, datapub, organisation, dates, files);

  const [ history, more_people ] = make_history(
    dataset, people, owner, approver, dates
  );
  if( more_people.length > 0 ) {
    people = people.concat(more_people);
  }

  // Everything which is linked to by '@id' has to be in the graph,
  // including the people and the files.
  const graph = _.flatten([
    dataset,
    organisation,
    people,
    files,
    history
  ]);

  // these are functions which try to get various DataCrate
  // properties from the datapub: if they return anything,
  // the results get pushed onto the graph and added to
  // the dataset
  _.forEach(DATASET_PROPERTIES, ( make_items, property ) => {
    // Some of the functions return nothing, some a flat array and some
    // an array of arrays with gaps, so normalise to a flat list of items.
    const items = _.compact(_.flattenDeep(make_items(datapub) || []));
    if( items.length > 0 ) {
      dataset[property] = items.map(link_id);
      graph.push(...items);
    }
  });

  return {
    '@context': context,
    '@graph': graph.filter((e) => e)
  };
}
// dates_default(values)
//
// Returns an object with dateCreated and datePublished set. A truthy
// value of type 'object' is converted with toISOString(), any other
// truthy value is passed through unchanged, and anything missing or
// falsy is replaced with the current time as an ISO string.
function dates_default(values) {
  const fallback = (new Date()).toISOString();
  const resolved = {};
  for( const key of [ 'dateCreated', 'datePublished' ] ) {
    const supplied = values ? values[key] : undefined;
    if( !supplied ) {
      resolved[key] = fallback;
    } else if( typeof(supplied) === 'object' ) {
      resolved[key] = supplied.toISOString();
    } else {
      resolved[key] = supplied;
    }
  }
  return resolved;
}
// link_id(item)
//
// Returns a JSON-LD reference to an item: a new object carrying only
// the item's '@id'.
function link_id(item) {
  const { '@id': target } = item;
  return { '@id': target };
}
// make_dataset(id, datapub, organisation, dates, files)
//
// Builds the root dataset item of the catalogue.
//
//   id            - used as the item's '@id'
//   datapub       - the data publication: title and description are read
//   organisation  - the publishing Organization item (linked by '@id')
//   dates         - object with dateCreated and datePublished
//   files         - array of File items (linked by '@id' as hasPart)
//
// TODO: isBasedOn the data record
// NOTE(review): 'Publisher' is capitalised while schema.org's property is
// 'publisher' - left as is because it may be a term defined in
// context.json; confirm.
function make_dataset(id, datapub, organisation, dates, files) {
  const ds = {
    '@id': id,
    '@type': 'Dataset',
    'name': datapub['title'],
    'description': datapub['description'],
    'dateCreated': dates['dateCreated'],
    'datePublished': dates['datePublished'],
    'Publisher': link_id(organisation),
    'hasPart': files.map(link_id),
  };
  return ds;
}
// find_by_email(people, email)
//
// Returns the first item in people whose 'email' loosely equals email,
// or undefined if there isn't one.
function find_by_email(people, email) {
  const [ first_match ] = people.filter((person) => person['email'] == email);
  return first_match;
}
// make_history(dataset, people, owner, approver, dates)
//
// Builds the two provenance actions for the dataset: a CreateAction by
// the owner at dates.dateCreated and an UpdateAction ('Publish') by the
// approver at dates.datePublished.
//
// owner and approver are emails. If one of them doesn't match anyone in
// people, a minimal item (the email as both '@id' and 'email') is made
// for them.
//
// Returns [ history, new_people ] where history is the array of the two
// actions and new_people is the array of items which had to be made.
function make_history(dataset, people, owner, approver, dates) {
  const new_people = [];

  // The items made here are searched too, so that an owner who is
  // also the approver only gets one item rather than two items with
  // the same '@id'.
  const agent_for = (email) => {
    var agent = find_by_email(people.concat(new_people), email);
    if( !agent ) {
      agent = {
        '@id': email,
        'email': email
      };
      new_people.push(agent);
    }
    return agent;
  };

  const owner_item = agent_for(owner);
  const approver_item = agent_for(approver);

  // TODO: descriptions for these and maybe a link to the
  // data record for the CreateAction
  const history = [
    {
      '@id': dataset['@id'] + '_history1',
      '@type': 'CreateAction',
      'name': 'Create',
      'endTime': dates['dateCreated'],
      'object': link_id(dataset),
      'agent': link_id(owner_item)
    },
    {
      '@id': dataset['@id'] + '_history2',
      '@type': 'UpdateAction',
      'name': 'Publish',
      'endTime': dates['datePublished'],
      'object': link_id(dataset),
      'agent': link_id(approver_item)
    }
  ];
  return [ history, new_people ];
}
// make_files(datapub)
//
// Crosswalks the datapub's dataLocations to an array of File items.
// Only locations whose type is 'attachment' are converted; the rest are
// dropped (todo: URLs and physical locations). Returns an empty array
// if the datapub has no dataLocations.
//
// The dataLocations are expected to have contentSize, encodingFormat
// and fileFormat already set.
//
// The location's name is used as the '@id', which assumes that the
// directory is flat, ie all of the files are in the root.
function make_files(datapub) {
  const locations = datapub['dataLocations'];
  if( !locations ) {
    return [];
  }
  const to_file = (location) => ({
    '@id': location['name'],
    'name': location['name'],
    '@type': 'File',
    'contentSize': location['contentSize'],
    'encodingFormat': location['encodingFormat'],
    'fileFormat': location['fileFormat']
  });
  return locations
    .filter((location) => location['type'] == 'attachment')
    .map(to_file);
}
// make_creators(datapub, organisation)
//
// Returns the array of Person items built from the creators field of
// the data publication. Each person is identified by the first of
// orcid, email and text_full_name which has a value; creators with none
// of these are left out. Every person is affiliated to organisation.
//
// Throws an Error if the datapub has no creators field.
//
// TODO: affiliations
function make_creators(datapub, organisation) {
  if( !datapub['creators'] ) {
    throw Error("A DataCrate has to have at least one creator");
  }
  const creators = datapub['creators'].map((p) => {
    const id = p['orcid'] || p['email'] || p['text_full_name'];
    if( id ) {
      return {
        // '@id' is what link_id() and the graph use to refer to the person
        '@id': id,
        '@type': 'Person',
        'identifier': id,
        'name': p['text_full_name'],
        'familyName': p['family_name'],
        'givenName': p['given_name'],
        'email': p['email'],
        'affiliation': link_id(organisation)
      };
    } else {
      // warn for unidentifiable creators
      return undefined;
    }
  });
  const nc = creators.filter((x) => x);
  return nc;
}
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
// make_related_works(datapub)
//
// Makes related-publication etc links from the datapub's
// related_publications, related_websites, related_metadata and
// related_data fields.
//
// This returns an array of arrays of items (one inner array per field,
// empty if the datapub doesn't have that field), which will be
// flattened when the graph is populated.
//
// Note that these are very simple - just URI, title and, if there are
// notes, a description.
//
// FIXME this assumes that there's a URI which can be used as the
// @id
function make_related_works(datapub) {
  return RELATED.map((thing) => {
    const related = datapub['related_' + thing];
    // not every data publication has every kind of related work
    if( !Array.isArray(related) ) {
      return [];
    }
    return related.map((item) => {
      const ld_item = {
        '@id': item['related_url'],
        '@type': RELATED_TYPES[thing],
        'identifier': item['related_url'],
        'name': item['related_title'],
      };
      if( item['related_notes'] ) {
        ld_item['description'] = item['related_notes'];
      }
      return ld_item;
    });
  });
}
// make_funders(datapub)
//
// Both grants (as projects) and funding bodies are represented
// in DataCrates as Organizations with the relationship 'funded'
//
// Returns an array of arrays of Organization items, one inner array per
// funding field (empty if the datapub doesn't have that field).
//
// TODO - can we do chaining as per the spec?
// TODO - bring the project in from the RDMP?
function make_funders(datapub) {
  return FUNDING_FIELDS.map((field) => {
    const funders = datapub[field];
    if( !Array.isArray(funders) ) {
      return [];
    }
    return funders.map((forg) => {
      return {
        '@id': forg['dc_identifier'],
        'identifier': forg['dc_identifier'],
        '@type': 'Organization',
        'name': forg['dc_title']
      };
    });
  });
}
// make_subjects(datapub)
//
// Builds the subject items from the datapub's ANZSRC FOR and SEO
// fields. Each subject's '@id' is the field's IRI prefix followed by
// the subject value (presumably a code or label string - confirm
// against a real record).
//
// Returns an array of arrays of items, one inner array per subject
// field. A field which is absent gives an empty array rather than
// undefined, so that callers never have to dereference a gap.
function make_subjects(datapub) {
  return SUBJECT_FIELDS.map((field) => {
    const subjects = datapub[field];
    if( !Array.isArray(subjects) ) {
      return [];
    }
    return subjects.map((subj) => {
      const id = SUBJECT_IRI_PREFIX[field] + subj;
      return {
        '@id': id,
        'identifier': id,
        'name': subj
      };
    });
  });
}
// make_spatial(datapub)
//
// spatialCoverage: returns one item per entry of the datapub's
// geolocations, numbered from 1 ('_:spatial/1', '_:spatial/2', ...)
// with the entry itself as the description. Returns an empty array if
// there are no geolocations.
//
// FIXME - datapub['geospatial'] is not crosswalked yet: need to see
// what this looks like in a real record
function make_spatial(datapub) {
  const geolocations = datapub['geolocations'];
  if( !geolocations ) {
    return [];
  }
  return geolocations.map((gl, index) => {
    const id = '_:spatial/' + String(index + 1);
    return {
      '@id': id,
      'identifier': id,
      'description': gl
    };
  });
}
// make_temporal(datapub)
//
// temporalCoverage: if the datapub has a startDate or an endDate,
// returns an array with a single item whose description is the
// interval written as 'start/end', with '..' standing in for whichever
// end is open. Returns an empty array otherwise.
//
// NOTE(review): assumes startDate and endDate are already date strings
// fit for publication (eg ISO 8601) - confirm against a real record.
function make_temporal(datapub) {
  const start = datapub['startDate'];
  const end = datapub['endDate'];
  if( !( start || end ) ) {
    return [];
  }
  const id = '_:temporal/1';
  return [
    {
      '@id': id,
      'identifier': id,
      'description': ( start || '..' ) + '/' + ( end || '..' )
    }
  ];
}
// make_licence(datapub)
//
// Placeholder: licences are not crosswalked yet, so this always
// returns undefined and nothing is added to the dataset.
function make_licence(datapub) {
  return;
}
// The conversion function is the module's only export.
module.exports = { datapub2catalog };