Newer
Older
/*
This is part of datacrate, a node.js library for working with
DataCrates. Copyright (C) 2018 University of Technology Sydney
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const path = require('path');
const fs = require('fs-extra');
const _ = require('lodash');
// The defaults directory sits one level above this module's directory;
// CONTEXT is the context.json file inside it.
const DEFAULTS = path.join(__dirname, '../defaults');
const CONTEXT = path.join(DEFAULTS, 'context.json');

// Kinds of related work. Each one is looked up on the data publication
// under the field name 'related_' + kind.
const RELATED = [
  'publications',
  'websites',
  'metadata',
  'data',
  'services'
];
// Maps each kind of related work listed in RELATED to the schema.org
// @type given to the items built for it.
const RELATED_TYPES = {
  'publications': 'ScholarlyArticle',
  'websites': 'WebSite',
  'metadata': 'CreativeWork', // schema.org hasn't got a dedicated type
  'data': 'Dataset',
  'services': 'CreativeWork' // I'm not sure about this?
};
// Data publication fields which are crosswalked to funder items.
const FUNDING_FIELDS = [
  'foaf:fundedBy_foaf:Agent',
  'foaf:fundedBy_vivo:Grant'
];

// Data publication fields which are crosswalked to subject items.
const SUBJECT_FIELDS = [
  'dc:subject_anzsrc:for',
  'dc:subject_anzsrc:seo'
];
// Default prefixes used to build the @id of generated items, keyed by
// the dataset property the items are attached to. 'about' is itself
// keyed by subject field because each field gets its own prefix.
const DEFAULT_IRI_PREFS = {
  'about': {
    'dc:subject_anzsrc:for': '_:FOR/',
    'dc:subject_anzsrc:seo': '_:SEO/'
  },
  'spatialCoverage': '_:spatial/',
  'funder': '_:funder/',
  'citation': '_:citation/'
};
// dataset -> about -> subjects
// dataset -> spatialCoverage -> place
// dataset -> temporalCoverage -> time
// dataset -> funder -> organisation
// dataset -> license -> licence (CreativeWork or URL)
// dataset -> citation -> related publications, websites, articles, datasets
// Crosswalk table for the linked properties of the root DataSet.
// Each key is a schema.org property name and each value is the function
// which builds that property's items: it is called with the data
// publication (plus the IRI prefix configured for the property) and
// returns null or an array of property values.
const DATASET_PROPERTIES = {
  about: make_subjects,
  spatialCoverage: make_spatial,
  funder: make_funders,
  citation: make_related_works
};
/* datapub2catalog(options)

   Convert metadata from a redbox data publication and
   its data record to a JSON-LD datacrate catalogue

   options = {
     id: redbox oid,
     datapub: redbox data publication record,
     org: {
       @id: organisation identifier,
       name: organisation name
     },
     owner: email of the data pub's record,
     approver: email of the librarian who approved publication,
     dates: {
       dateCreated: Date object or string,
       datePublished: Date object or string
     },
     prefixes: optional IRI prefixes keyed by dataset property
               (DEFAULT_IRI_PREFS is used when this is not given)
   }

   The organisation is read from options.org; options.organisation is
   accepted as a fallback because that is the key this function used
   to read.

   If there are no dates, or either dateCreated or datePublished is
   undefined, now() is used as the default.

   The owner and approver are used to build two Actions against
   the object - creation and publication.

   Returns a promise for { '@context': ..., '@graph': [ ... ] }.
   Rejects with an Error if the organisation has no @id or no name.

   TODO: add contentSize, encodingFormat and fileFormat from
   siegfried to the dataLocations array

   Remember to keep the connection to the data record
*/
async function datapub2catalog(options) {
  const id = options['id'];
  const datapub = options['datapub'];
  const org = options['org'] || options['organisation'] || {};
  const owner = options['owner'];
  const approver = options['approver'];
  const dates = dates_default(options['dates']);
  const prefixes = options['prefixes'] || DEFAULT_IRI_PREFS;

  const organisation = {
    '@id': org['@id'],
    '@type': 'Organization',
    'name': org['name']
  };

  // validate before touching the filesystem
  if( ! (organisation['@id'] && organisation['name'] ) ) {
    throw Error("Organization must have an id and a name");
  }

  const context = await fs.readJson(CONTEXT);

  // assumes that all creators are affiliated to this one organisation
  const people = make_creators(datapub, organisation);
  const files = make_files(datapub);
  const dataset = make_dataset(id, datapub, organisation, dates, files);

  // more_people are the owner and/or approver when they are not
  // already among the creators
  const [ history, more_people ] = make_history(
    dataset, people, owner, approver, dates
  );

  // everything which is referenced by @id from the dataset or the
  // history actions has to be in the graph as well
  const graph = _.flatten([
    dataset,
    organisation,
    people,
    more_people,
    files,
    history
  ]);

  // these are functions which try to get various DataCrate
  // properties from the datapub: if they return anything,
  // the results get pushed onto the graph and added to
  // the dataset
  _.forEach(DATASET_PROPERTIES, ( make_items, property ) => {
    const items = make_items(datapub, prefixes[property]);
    if( items && items.length > 0 ) {
      // the make_ functions may return arrays of arrays and may
      // contain empty slots, so drop the blanks and flatten one level
      const eitems = _.flatten(items.filter((i) => i));
      if( eitems.length > 0 ) {
        dataset[property] = eitems.map(link_id);
        graph.push(...eitems);
      }
    }
  });

  return {
    '@context': context,
    '@graph': graph.filter((e) => e)
  };
}
// dates_default(values)
//
// Returns an object with dateCreated and datePublished both set.
// A supplied value which is an object (eg a Date) is converted with
// toISOString(); any other truthy value is passed through unchanged;
// a missing or falsy value is replaced by the current time as an ISO
// string. The same "now" is used for both properties.
function dates_default(values) {
  const fallback = new Date().toISOString();
  const supplied = values || {};
  const result = {};
  for( const key of [ 'dateCreated', 'datePublished' ] ) {
    const value = supplied[key];
    if( !value ) {
      result[key] = fallback;
    } else if( typeof value === 'object' ) {
      result[key] = value.toISOString();
    } else {
      result[key] = value;
    }
  }
  return result;
}
// link_id(item)
//
// Returns a new object holding only the item's @id, which is how one
// item in the graph refers to another.
function link_id(item) {
  const ref = {};
  ref['@id'] = item['@id'];
  return ref;
}
// make_dataset(id, datapub, organisation, dates, files)
//
// Builds the root dataset item. The name and description come from
// the datapub title and description, the dates from the (already
// defaulted) dates object, the publisher is a link to the organisation
// and hasPart links to each of the files.
//
// keywords is only set if the datapub has finalKeywords, and
// temporalCoverage only if make_temporal returns a non-empty value.
//
// TODO: isBasedOn the data record
function make_dataset(id, datapub, organisation, dates, files) {
  const ds = {
    '@id': id,
    'name': datapub['title'],
    'description': datapub['description'],
    'dateCreated': dates['dateCreated'],
    'datePublished': dates['datePublished'],
    'publisher': link_id(organisation),
    'hasPart': files.map(link_id)
  };

  // see https://schema.org/keywords
  if( datapub['finalKeywords'] ) {
    ds['keywords'] = datapub['finalKeywords'].join(', ');
  }

  const temporal = make_temporal(datapub);
  if( temporal ) {
    ds['temporalCoverage'] = temporal;
  }

  return ds;
}
// find_by_email(people, email)
//
// Returns the first item in people whose email property loosely
// equals email, or undefined if there isn't one.
function find_by_email(people, email) {
  return people.find((person) => person['email'] == email);
}
// make_history(dataset, people, owner, approver, dates)
//
// Builds the two actions which record the dataset's history: a
// CreateAction by the owner, ending at dateCreated, and an
// UpdateAction ("Publish") by the approver, ending at datePublished.
//
// owner and approver are email addresses. Each is looked up in people;
// if there is no match a minimal item with the email as its @id is
// created. An address is only ever given one new item, even when the
// owner and the approver are the same person.
//
// Returns [ history, new_people ] where history is the array of the
// two actions and new_people the array of items created here.
function make_history(dataset, people, owner, approver, dates) {
  const new_people = [];

  // look in the known people first and then in the items created
  // during this call, so that two items never share an @id
  const person_for = (email) => {
    let item = find_by_email(people, email) || find_by_email(new_people, email);
    if( !item ) {
      item = {
        '@id': email,
        'email': email
      };
      new_people.push(item);
    }
    return item;
  };

  const owner_item = person_for(owner);
  const approver_item = person_for(approver);

  // TODO: descriptions for these and maybe a link to the
  // data record for the CreateAction
  const history = [
    {
      '@id': dataset['@id'] + '_history1',
      '@type': 'CreateAction',
      'name': 'Create',
      'endTime': dates['dateCreated'],
      'object': link_id(dataset),
      'agent': link_id(owner_item)
    },
    {
      '@id': dataset['@id'] + '_history2',
      '@type': 'UpdateAction',
      'name': 'Publish',
      'endTime': dates['datePublished'],
      'object': link_id(dataset),
      'agent': link_id(approver_item)
    }
  ];

  return [ history, new_people ];
}
// make_files(datapub)
//
// Crosswalks the datapub's dataLocations to an array of File items.
// Only locations whose type is 'attachment' are converted; the
// location's name is used for both the @id and the name, and its
// contentSize is copied across. Returns an empty array if the datapub
// has no dataLocations.
//
// This assumes that the directory is flat ie all of the files
// are in the root
function make_files(datapub) {
  const locations = datapub['dataLocations'];
  if( !locations ) {
    return [];
  }
  // todo: URLs and physical locations
  const attachments = locations.filter((loc) => loc['type'] == 'attachment');
  return attachments.map((loc) => ({
    '@id': loc['name'],
    'name': loc['name'],
    '@type': 'File',
    'contentSize': loc['contentSize']
  }));
}
// make_creators(datapub, organisation)
//
// returns the array of Person items from the creators
// field of the data publication. Every Person is affiliated to
// the organisation passed in.
//
// A creator's id is their orcid, or else their email, or else their
// full name. Creators with none of these are left out of the result.
//
// Throws an Error if the datapub has no creators field.
//
// NOTE(review): the '@id' and '@type' properties were restored when
// the body of this function was repaired - confirm against the
// original item shape.
//
// TODO: affiliations
function make_creators(datapub, organisation) {
  if( !datapub['creators'] ) {
    throw Error("A DataCrate has to have at least one creator");
  }
  const creators = datapub['creators'].map((p) => {
    const id = p['orcid'] || p['email'] || p['text_full_name'];
    if( id ) {
      return {
        '@id': id,
        '@type': 'Person',
        'identifier': id,
        'name': p['text_full_name'],
        'familyName': p['family_name'],
        'givenName': p['given_name'],
        'email': p['email'],
        'affiliation': link_id(organisation)
      };
    } else {
      // warn for unidentifiable creators
      return undefined;
    }
  });
  return creators.filter((x) => x);
}
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
// make_related_works(datapub)
//
// make related-publication etc links
//
// For each kind of related work in RELATED this reads the datapub
// field 'related_' + kind and converts its entries to items typed
// according to RELATED_TYPES. A kind whose field is missing (or is not
// an array) contributes an empty array.
//
// this returns an array of arrays of items, which will be _.flattened
// when the graph is populated
//
// note that these are very simple - just URI, title and description
//
// FIXME this assumes that there's a URI which can be used as the
// @id
function make_related_works(datapub) {
  return RELATED.map((thing) => {
    const related = datapub['related_' + thing];
    if( !Array.isArray(related) ) {
      return [];
    }
    return related.map((item) => {
      const ld_item = {
        '@id': item['related_url'],
        '@type': RELATED_TYPES[thing],
        'identifier': item['related_url'],
        'name': item['related_title']
      };
      if( item['related_notes'] ) {
        ld_item['description'] = item['related_notes'];
      }
      return ld_item;
    });
  });
}
// make_funders(datapub, prefix)
//
// Both grants (as projects) and funding bodies are represented
// in DataCrates as Organizations with the relationship 'funded'
//
// For each field in FUNDING_FIELDS this converts the entries which
// have a dc_identifier into Organization items. The item's @id is the
// prefix followed by the first element of dc_identifier. Returns an
// array with one array of items per field (empty when the datapub
// doesn't have the field).
//
// TODO - can we do chaining as per the spec?
// TODO - bring the project in from the RDMP?
function make_funders(datapub, prefix) {
  return FUNDING_FIELDS.map((field) => {
    if( datapub[field] ) {
      const funders = datapub[field].filter((forg) => forg['dc_identifier']);
      return funders.map((forg) => {
        const id = prefix + forg['dc_identifier'][0];
        return {
          '@id': id,
          'identifier': id,
          '@type': 'Organization',
          'name': forg['dc_title']
        };
      });
    } else {
      return [];
    }
  });
}
// make_subjects(datapub, prefixes)
//
// For each field in SUBJECT_FIELDS this converts the datapub's entries
// into subject items. The item's @id is the prefix for that field
// (prefixes is keyed by field name) followed by the entry's notation.
// Returns an array with one array of items per field (empty when the
// datapub doesn't have the field).
//
// NOTE(review): only '@id' and 'identifier' are set on the items; if
// the items are meant to carry a type or a name as well, confirm
// against the original and add them.
//
// note - keywords are added on the dataset item
function make_subjects(datapub, prefixes) {
  return SUBJECT_FIELDS.map((field) => {
    if( datapub[field] ) {
      return datapub[field].map((cat) => {
        const id = prefixes[field] + cat['notation'];
        return {
          '@id': id,
          'identifier': id
        };
      });
    } else {
      return [];
    }
  });
}
// make_spatial(datapub, prefix)
//
// spatialCoverage
//
// Builds one Place item (with GeoCoordinates) for each entry in the
// datapub's geolocations, and one further item for the datapub's
// geospatial value if it has one. The items' @ids are the prefix
// followed by a running number which starts at 1. Returns an empty
// array if the datapub has neither field.
function make_spatial(datapub, prefix) {
  let items = [];

  if( datapub['geolocations'] ) {
    items = datapub['geolocations'].map((gl, index) => {
      const id = prefix + String(index + 1);
      return {
        '@id': id,
        'identifier': id,
        '@type': 'Place',
        'name': gl['basic_name'],
        'geo': {
          '@type': "GeoCoordinates",
          'latitude': gl['latitude'],
          'longitude': gl['longitude']
        }
      };
    });
  }

  if( datapub['geospatial'] ) {
    // these are GeoJSON, for which there is a JSON-LD context
    // defined here: http://geojson.org/geojson-ld/
    // TODO: append it to the context.
    // for now I'm just passing it through to an item with the
    // @type from the GeoJSON type

    // carries on numbering from the geolocation items
    const id = prefix + String(items.length + 1);
    items.push({
      '@id': id,
      'identifier': id,
      '@type': datapub['geospatial']['type'],
      'name': id,
      'features': datapub['geospatial']['features']
    });
  }

  return items;
}
// make_temporal(datapub)
//
// temporal_coverage is not a linked item but a Text or
// DateTime property on the dataset
//
// this merges all of the datapub time properties as follows:
//
//   startDate
//   startDate/endDate
//   endDate
//   timePeriod
//   startDate; timePeriod .. etc
//
// Returns the merged string, which is empty if the datapub has
// none of the three properties.
function make_temporal(datapub) {
  let tc = '';

  if( datapub['startDate'] ) {
    tc = datapub['startDate'];
    if( datapub['endDate'] ) {
      tc += '/' + datapub['endDate'];
    }
  } else if( datapub['endDate'] ) {
    tc = datapub['endDate'];
  }

  if( datapub['timePeriod'] ) {
    if( tc ) {
      tc = tc + '; ' + datapub['timePeriod'];
    } else {
      tc = datapub['timePeriod'];
    }
  }

  return tc;
}
// make_license(datapub, prefix)
//
// Returns an array holding a single CreativeWork item for the
// datapub's licence, or an empty array if it has none.
//
// this assumes that the license_other supersedes
// the license_statement: if there is a license_other_url it is used
// as the @id, with license_notes as the name; otherwise, if there is
// a license_statement, the @id is license_identifier.
//
// The items use the JSON-LD keyword '@type', like every other item
// in the catalogue. prefix is not used.
function make_license(datapub, prefix) {
  if( datapub['license_other_url'] ) {
    return [{
      '@id': datapub['license_other_url'],
      '@type': 'CreativeWork',
      'name': datapub['license_notes']
    }];
  }
  if( datapub['license_statement'] ) {
    return [{
      '@id': datapub['license_identifier'],
      '@type': 'CreativeWork',
      'name': datapub['license_statement'],
      'url': datapub['license_statement_url']
    }];
  }
  return [];
}
// datapub2catalog is the only function exported by this module.
module.exports = { datapub2catalog };