/*
This is part of datacrate, a node.js library for working with
DataCrates. Copyright (C) 2018 University of Technology Sydney
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const fs = require('fs-extra');
const path = require('path');
const _ = require('lodash');
const calcyte = require('calcyte');
const DEFAULTS = path.join(__dirname, "../defaults");
//const CONTEXT = path.join(DEFAULTS, 'context.json');
const GEOCONTEXT = path.join(DEFAULTS, 'geojson-context.json');

// datapub fields which hold related works ('related_' + name), and the
// schema.org type each kind of related work is crosswalked to
const RELATED = [ 'publications', 'websites', 'metadata', 'data', 'services' ];
const RELATED_TYPES = {
    'publications': 'ScholarlyArticle',
    'websites': 'WebSite',
    'metadata': 'CreativeWork', // schema.org hasn't got a dedicated type
    'data': 'Dataset',
    'services': 'CreativeWork' // I'm not sure about this?
};

// root path of the payload: bagged crates keep the payload under data/
const ROOT_PATH_BAGGED = 'data/';
const ROOT_PATH_WORKING = './';

const FUNDING_FIELDS = [ 'foaf:fundedBy_foaf:Agent', 'foaf:fundedBy_vivo:Grant' ];
const SUBJECT_FIELDS = [ 'dc:subject_anzsrc:for', 'dc:subject_anzsrc:seo' ];

// prefixes for minting internal (blank-node) IRIs, keyed by the
// dataset property the minted items hang off
const DEFAULT_IRI_PREFS = {
    'about': {
        'dc:subject_anzsrc:for': '_:FOR/',
        'dc:subject_anzsrc:seo': '_:SEO/'
    },
    'spatialCoverage': '_:spatial/',
    'funder': '_:funder/',
    'citation': '_:citation/'
};

// TODO - a general function for minting new internal IRIs given one
// of the above prefixes ()

// dataset -> about -> subjects
// dataset -> spatialCoverage -> place
// dataset -> temporalCoverage -> time
// dataset -> funder -> organisation
// dataset -> license -> licence (CreativeWork or URL)
// dataset -> citation -> related publications, websites, articles, datasets

// the keys here are schema.org properties on the root DataSet
// and the values are functions which take the data publication
// as an argument and return null or an array of property values
// (the function declarations below are hoisted, so forward
// references are safe here)
const DATASET_PROPERTIES = {
    'about': make_subjects,
    'spatialCoverage': make_spatial,
    'funder': make_funders,
    'citation': make_related_works
};
/* datapub2catalog(options)
Convert metadata from a redbox data publication and
its data record to a JSON-LD datacrate catalogue
options = {
id: redbox oid,
datapub: redbox data publication record,
org: {
@id: organisation identifier,
name: organisation name
},
owner: email of the data pub's record,
approver: email of the librarian who approved publication
dates: {
dateCreated: Date object or string}
datePublished: Date object or string
}
}
If there are no dates, or either dateCreated or datePublished is
undefined, now() is used as the default.
The owner and approver are used to build two Actions against
the object - creation and publication.
TODO: add contentSize, encodingFormat and fileFormat from
siegfried to the dataLocations array
Remember to keep the connection to the data record
*/
async function datapub2catalog(options) {
    const id = options['id'];
    const datapub = options['datapub'];
    const bagged = options['bagged'];
    const zip = options['zip'];
    const org = options['organisation'];
    const owner = options['owner'];
    const approver = options['approver'];
    const dates = dates_default(options['dates']);
    const prefixes = options['prefixes'] || DEFAULT_IRI_PREFS;

    const organisation = {
        '@id': org['@id'],  // required: the check below rejects an org without an @id
        '@type': 'Organization',
        'name': org['name']
    };
    if( ! (organisation['@id'] && organisation['name'] ) ) {
        throw Error("Organization must have an id and a name");
    }
    // assumes that all creators are affiliated to this one organisation
    var people = make_creators(datapub, organisation);
    const files = make_files(datapub);
    const dataset = make_dataset(id, datapub, bagged, organisation, dates, files, people);
    const [ history, more_people ] = make_history(
        dataset, people, owner, approver, dates
    );
    if( more_people.length > 0 ) {
        people = people.concat(more_people);
    }
    // people must be in the graph: the dataset and history items only
    // link to them by @id
    const graph = _.flatten([
        dataset,
        organisation,
        people,
        history
    ]);
    // these are functions which try to get various DataCrate
    // properties from the datapub: if they return anything,
    // the results get pushed onto the graph and added to
    // the dataset
    _.forEach(DATASET_PROPERTIES, ( make_items, property ) => {
        const items = make_items(datapub, prefixes[property]);
        if( items && items.length > 0 ) {
            // drop empty slots, then flatten the arrays-of-arrays the
            // make_* functions return
            const eitems = _.flatten(items.filter((i) => i));
            if( eitems.length > 0 ) { // was `if( eitems )` - an array is always truthy
                dataset[property] = eitems.map(link_id);
                graph.push(...eitems);
            }
        }
    });
    // NOTE(review): this mutates the shared calcyte default context -
    // confirm that's intended
    const context = calcyte.defaults.context;
    // this isn't working with trim_context, which is clobbering the
    // geojson types for reasons I don't understand - see PT
    if( datapub['geospatial'] ) {
        const geocontext = await fs.readJson(GEOCONTEXT);
        Object.keys(geocontext['@context']).forEach((k) => {
            if( context[k] ) {
                console.error("Duplicate geocontext for " + k);
            }
            // copy the term definition itself: the terms live under
            // '@context', so geocontext[k] would be undefined
            context[k] = geocontext['@context'][k];
        });
    }
    if( zip ) {
        // advertise the zipped crate as a schema.org DataDownload
        if( !dataset["distribution"] ) {
            dataset["distribution"] = [];
        }
        dataset["distribution"].push(
            {
                "@id": zip
            }
        );
        graph.push(
            {
                "@id": zip,
                "contentUrl": zip,
                "@type": "DataDownload",
                "encodingFormat": "zip"
            });
        context["DataDownload"] = "https://schema.org/DataDownload";
        context["distribution"] = "https://schema.org/distribution";
    }
    const catalog = {
        '@context': context,
        '@graph': graph.filter((e) => e)
    };
    const jsonld = new calcyte.jsonld();
    jsonld.init(catalog);
    jsonld.trim_context();
    return jsonld.json_ld;
}
// Normalise the dates option: returns an object with dateCreated and
// datePublished as ISO-8601 strings. Date objects are converted with
// toISOString(); missing or falsy values default to now.
function dates_default(values) {
    const now = (new Date()).toISOString();
    const result = {};
    for( const key of [ 'dateCreated', 'datePublished' ] ) {
        const supplied = values ? values[key] : undefined;
        if( supplied ) {
            result[key] = ( typeof(supplied) === 'object' ) ? supplied.toISOString() : supplied;
        } else {
            result[key] = now;
        }
    }
    return result;
}
// Return a JSON-LD reference node for an item: just its @id, so the
// full item can live elsewhere in the @graph.
function link_id(item) {
    const ref = {};
    ref['@id'] = item['@id'];
    return ref;
}
// TODO: isBasedOn the data record
// always does data/ as the root path

// Build the root Dataset item for the catalogue: core descriptive
// metadata plus @id links to the files and creators.
// NOTE(review): no '@type' is set here - confirm whether calcyte adds
// 'Dataset' or whether it should be added in this crosswalk.
function make_dataset(id, datapub, bagged, organisation, dates, files, creators) {
    const ds = {
        '@id': id,
        'path': bagged ? ROOT_PATH_BAGGED : ROOT_PATH_WORKING,
        'name': datapub['title'],
        'description': datapub['description'],
        'dateCreated': dates['dateCreated'],
        'datePublished': dates['datePublished'],
        'publisher': link_id(organisation),
        'hasPart': files.map(link_id),
        'creator': creators.map(link_id)
    };
    // see https://schema.org/keywords - comma-separated text
    if( datapub['finalKeywords'] ) {
        ds['keywords'] = datapub['finalKeywords'].join(', ');
    }
    const temporal = make_temporal(datapub);
    if( temporal ) {
        ds['temporalCoverage'] = temporal;
    }
    return ds;
}
// Look up a person item by its 'email' field; returns the first match
// or undefined if nobody matches.
function find_by_email(people, email) {
    return people.find((person) => person['email'] == email);
}
// Build the provenance history for the dataset: a CreateAction (owner)
// and an UpdateAction named 'Publish' (approver). If either actor isn't
// already in people, a minimal stub item (@id = email) is minted and
// returned in new_people so the caller can add it to the graph.
// Returns [ history, new_people ].
function make_history(dataset, people, owner, approver, dates) {
    const new_people = [];
    // resolve an email to an existing person, or mint a stub
    const resolve = (email) => {
        const existing = find_by_email(people, email);
        if( existing ) {
            return existing;
        }
        const stub = {
            '@id': email,
            'email': email
        };
        new_people.push(stub);
        return stub;
    };
    const owner_item = resolve(owner);
    const approver_item = resolve(approver);
    // TODO: descriptions for these and maybe a link to the
    // data record for the CreateAction
    const history = [
        {
            '@type': 'CreateAction',
            'name': 'Create',
            'endTime': dates['dateCreated'],
            'object': link_id(dataset),
            'agent': link_id(owner_item)
        },
        {
            '@type': 'UpdateAction',
            'name': 'Publish',
            'endTime': dates['datePublished'],
            'object': link_id(dataset),
            'agent': link_id(approver_item)
        }
    ];
    return [ history, new_people ];
}
// files(datapub)
// crosswalk dataLocations to an array of File items.
// The dataLocations are expected to have contentSize,
// encodingFormat and fileFormat already set
// This assumes that the directory is flat ie all of the files
// are in the root
function make_files(datapub) {
    const locations = datapub['dataLocations'];
    if( !locations ) {
        return [];
    }
    const files = [];
    for( const dl of locations ) {
        if( dl['type'] == 'attachment' ) {
            files.push({
                '@id': dl['name'],
                'name': dl['name'],
                '@type': 'File',
                'contentSize': dl['contentSize'],
            });
        }
        // todo: URLs and physical locations
    }
    return files;
}
// creators(datapub)
//
// returns the array of Person items from the creators
// field of the data publication
// Doesn't add the creators to the datapub - the calling
// code is responsible for that.
// TODO: affiliations
// NOTE(review): the `if( id )` / `return {` lines were lost from this
// file and have been reconstructed - each creator is identified by
// orcid, email or full name, in that order of preference.
function make_creators(datapub, organisation) {
    if( !datapub['creators'] ) {
        throw Error("A DataCrate has to have at least one creator");
    }
    const creators = datapub['creators'].map((p) => {
        const id = p['orcid'] || p['email'] || p['text_full_name'];
        if( id ) {
            return {
                '@id': id,
                'identifier': id,
                'name': p['text_full_name'],
                'familyName': p['family_name'],
                'givenName': p['given_name'],
                'email': p['email'],
                'affiliation': link_id(organisation)
            };
        } else {
            // warn for unidentifiable creators
            return undefined;
        }
    });
    // drop the unidentifiable creators
    const nc = creators.filter((x) => x);
    return nc;
}
// make related-publication etc links
// this returns an array of arrays of items, which will be _.flattened
// when the graph is populated
// note that these are very simple - just URI, title and description
// FIXME this assumes that there's a URI which can be used as the
// @id
function make_related_works(datapub) {
    return RELATED.map((thing) => {
        const field = 'related_' + thing;
        var values = [];
        if( field in datapub ) {
            values = datapub[field];
        }
        // only items with a URL can be crosswalked: the URL is the @id
        return values.filter((f) => f['related_url']).map((item) => {
            const ld_item = {
                '@id': item['related_url'],
                '@type': RELATED_TYPES[thing],
                'identifier': item['related_url'],
                'name': item['related_title'],
            };
            // (removed a duplicated copy of this if-block)
            if( item['related_notes'] ) {
                ld_item['description'] = item['related_notes'];
            }
            return ld_item;
        });
    });
}
// Both grants (as projects) and funding bodies are represented
// in DataCrates as Organizations with the relationship 'funded'
// TODO - can we do chaining as per the spec?
// TODO - bring the project in from the RDMP?
// Returns an array of arrays (one per FUNDING_FIELDS entry, [] where
// the field is absent) which the caller flattens.
// NOTE(review): the `return {` line was lost from this file and has
// been reconstructed; the @id is minted from the first dc_identifier.
function make_funders(datapub, prefix) {
    return FUNDING_FIELDS.map((field) => {
        if( datapub[field] ) {
            // only funders with an identifier can be crosswalked
            const funders = datapub[field].filter((forg) => forg['dc_identifier']);
            return funders.map((forg) => {
                const id = prefix + forg['dc_identifier'][0];
                return {
                    '@id': id,
                    'identifier': id,
                    '@type': 'Organization',
                    'name': forg['dc_title']
                };
            });
        } else {
            return [];
        }
    });
}
// note - keywords are added on the dataset item
// Crosswalk ANZSRC FOR/SEO subject codes to items with minted @ids.
// Returns an array of arrays (one per SUBJECT_FIELDS entry, [] where
// the field is absent) which the caller flattens.
// NOTE(review): the inner map over subject codes was lost from this
// file and has been reconstructed on the assumption that each value
// carries a 'notation' (the ANZSRC code) - confirm against the
// redbox data publication schema.
function make_subjects(datapub, prefixes) {
    return SUBJECT_FIELDS.map((field) => {
        if( datapub[field] ) {
            return datapub[field].map((cat) => {
                const id = prefixes[field] + cat['notation'];
                return {
                    '@id': id,
                    'identifier': id,
                };
            });
        } else {
            return [];
        }
    });
}
// spatialCoverage
// Crosswalk geolocations (point coordinates) and geospatial (GeoJSON)
// to Place/GeoJSON items with minted @ids (prefix + running counter).
// Returns a flat array of items; [] if neither field is present.
function make_spatial(datapub, prefix) {
    var items = [];
    var i = 1; // counter shared by both sections so @ids don't collide
    if( datapub['geolocations'] ) {
        const geols = datapub['geolocations'].filter((g) => !_.isEmpty(g));
        items = geols.map((gl) => {
            const id = prefix + String(i);
            i += 1;
            return {
                '@id': id,
                'identifier': id,
                '@type': 'Place',
                'name': gl['basic_name'],
                'geo': {
                    '@type': "GeoCoordinates",
                    'latitude': gl['latitude'],
                    'longitude': gl['longitude']
                }
            };
        });
    }
    if( datapub['geospatial'] && datapub['geospatial']['type'] ) {
        // these are GeoJSON, for which there is a JSON-LD context
        // defined here: http://geojson.org/geojson-ld/
        // Upstairs, the code that calls this checks for a geospatial
        // item in the datapub and adds the GeoJSON context to the
        // document before trimming it
        const id = prefix + String(i);
        items.push({
            '@id': id,
            'identifier': id,
            '@type': datapub['geospatial']['type'],
            'name': id,
            'features': datapub['geospatial']['features']
        });
    }
    return items;
}
// temporal_coverage is not a linked item but a Text or
// DateTime property on the dataset
// this merges all of the datapub time properties as follows:
// startDate
// startDate/endDate
// endDate
// timePeriod
// startDate; timePeriod .. etc
// Returns '' if no temporal fields are present (caller treats that
// as "no temporalCoverage").
function make_temporal(datapub) {
    var tc = '';
    if( datapub['startDate'] ) {
        tc = datapub['startDate'];
        if( datapub['endDate'] ) {
            // ISO-8601 style interval: start/end
            tc += '/' + datapub['endDate'];
        }
    } else if ( datapub['endDate'] ) {
        tc = datapub['endDate'];
    }
    if( datapub['timePeriod'] ) {
        if( tc ) {
            tc = tc + '; ' + datapub['timePeriod'];
        } else {
            tc = datapub['timePeriod'];
        }
    }
    return tc;
}
// this assumes that the license_other supersedes
// the license_statement
// Build an array of CreativeWork licence items from the datapub's
// licence fields; returns [] if none are present.
// NOTE(review): this function isn't wired into DATASET_PROPERTIES even
// though the property map comment lists dataset -> license - confirm
// whether that's intentional. The `licenses` declaration and several
// closing braces were lost from this file and have been reconstructed.
function make_license(datapub, prefix) {
    const licenses = [];
    if( datapub['license_other_url'] || datapub['license_notes'] ) {
        if( datapub['license_other_url'] ) {
            licenses.push({
                '@id': datapub['license_other_url'],
                '@type': 'CreativeWork',
                'url': datapub['license_other_url'],
                'name': ( datapub['license_notes'] || datapub['license_other_url'])
            });
        } else {
            // notes-only licence: mint an internal id
            // NOTE(review): DEFAULT_IRI_PREFS has no 'licenses' key, so
            // this currently yields "undefinedother" - confirm the
            // intended prefix
            licenses.push({
                '@id': DEFAULT_IRI_PREFS['licenses'] + 'other',
                '@type': 'CreativeWork',
                'name': datapub['license_notes']
            });
        }
    }
    if( datapub['license_identifier'] ) {
        licenses.push({
            '@type': 'CreativeWork',
            'name': datapub['license_identifier'],
            'url': datapub['license_identifier']
        });
    }
    return licenses;
}
// Public API: the single crosswalk entry point.
module.exports = { datapub2catalog };