/* This is part of datacrate, a node.js library for working with DataCrates. Copyright (C) 2018 University of Technology Sydney This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ const fs = require('fs-extra'); const _ = require('lodash'); const path = require('path'); const DEFAULTS = path.join(__dirname, "../defaults"); const CONTEXT = path.join(DEFAULTS, 'context.json'); const RELATED = [ 'publications', 'websites', 'metadata', 'data' ]; const RELATED_TYPES = { 'publications': 'ScholarlyArticle', 'websites': 'WebSite', 'metadata': 'DataSet', // schema.org hasn't got a dedicated type 'data': 'DataSet' }; const FUNDING_FIELDS = [ 'foaf:fundedBy_foaf:Agent', 'foaf:fundedBy_vivo:Grant' ]; const SUBJECT_FIELDS = [ 'dc:subject_anzsrc:for', 'dc:subject_anzsrc:seo' ]; // these will do for now const SUBJECT_IRI_PREFIX = { 'dc:subject_anzsrc:for': '_:FOR/', 'dc:subject_anzsrc:seo': '_:SEO/' }; // dataset -> about -> subjects // dataset -> spatialCoverage -> place // dataset -> temporalCoverate -> time // dataset -> funder -> organisation // dataset -> licence -> licence (CreativeWork or URL) // dataset -> citation -> related publications, websites, articles, datasets // the keys here are schema.org properties on the root DataSet // and the values are functions which take the data publication // as an argument and return null or an array of property values const DATASET_PROPERTIES = { 'about': make_subjects, 'spatialCoverage': make_spatial, 'temporalCoverate': make_temporal, 'funder': make_funders, 'licence': make_licence, 'citation': make_related_works }; /* datapub2catalog(options) Convert metadata from a redbox data publication and its data record to a JSON-LD datacrate catalogue options = { id: redbox oid, datapub: redbox data publication record, org: { @id: organisation identifier, name: organisation name }, owner: email of the data pub's record, approver: email of the librarian who approved publication dates: { dateCreated: Date object or string} datePublished: Date object or string } } If there are no dates, or either dateCreated or datePublished is undefined, now() is used as the default. The owner and approver are used to build two Actions against the object - creation and publication. TODO; add contentSize, encodingFormat and fileFormat from siegfried to the dataLocations array Remember to keep the connection to the data record */ async function datapub2catalog(options) { const id = options['id']; const datapub = options['datapub']; const org = options['organisation']; const owner = options['owner']; const approver = options['approver']; const dates = dates_default(options['dates']) const context = await fs.readJson(CONTEXT); const catalog = { '@context': context }; const organisation = { '@id': org['id'], '@type': 'Organization', 'identifier': org['id'], 'name': org['name'] }; if( ! (organisation['@id'] && organisation['name'] ) ) { throw Error("Organization must have an id and a name"); } // assumes that all creators are affiliated to this one organisation var people = make_creators(datapub, organisation); const files = make_files(datapub); const dataset = make_dataset(id, datapub, organisation, dates, files); const [ history, more_people ] = make_history( dataset, people, owner, approver, dates ); if( more_people.length > 0 ) { people = people.concat(more_people); } const graph = _.flatten([ dataset, organisation, people, files, history ]); // these are functions which try to get various DataCrate // properties from the datapub: if they return anything, // the results get pushed onto the graph and added to // the dataset _.forEach(DATASET_PROPERTIES, ( make_items, property ) => { const items = make_items(datapub); if( items ) { const eitems = items.filter((i) => i); if( eitems ) { dataset[property] = items.map(link_id); graph.push(eitems); } } }); return { '@context': context, '@graph': graph.filter((e) => e) }; } function dates_default(values) { var dates = {}; const now = (new Date()).toISOString(); [ 'dateCreated', 'datePublished' ].map((d) => { if( values && values[d] ) { dates[d] = ( typeof(values[d]) === 'object' ) ? values[d].toISOString() : values[d]; } else { dates[d] = now; } }); return dates; } function link_id(item) { return { '@id': item['@id'] }; } // TODO: isBasedOn the data record function make_dataset(id, datapub, organisation, dates, files) { const ds = { '@id': id, '@type': 'Dataset', 'path': './', 'name': datapub['title'], 'description': datapub['description'], 'dateCreated': dates['dateCreated'], 'datePublished': dates['datePublished'], 'publisher': link_id(organisation), 'hasPart': files.map(link_id), }; return ds; } function find_by_email(people, email) { const match = people.filter((x) => x['email'] == email); if( match.length ) { return match[0]; } else { return undefined; } } function make_history(dataset, people, owner, approver, dates) { var owner_item = find_by_email(people, owner); var approver_item = find_by_email(people, approver); const new_people = [] if( !owner_item ) { owner_item = { '@id': owner, '@type': 'Person', 'email': owner }; new_people.push(owner_item) } if( !approver_item ) { approver_item = { '@id': approver, '@type': 'Person', 'email': approver }; new_people.push(approver_item) } // TODO: descriptions for these and maybe a link to the // data record for the CreateAction const history = [ { '@id': dataset['@id'] + '_history1', '@type': 'CreateAction', 'name': 'Create', 'description': 'Data record created', 'endTime': dates['dateCreated'], 'object': link_id(dataset), 'agent': link_id(owner_item) }, { '@id': dataset['@id'] + '_history2', '@type': 'UpdateAction', 'name': 'Publish', 'endTime': dates['datePublished'], 'object': link_id(dataset), 'agent': link_id(approver_item) } ]; return [ history, new_people ]; } // files(datapub) // crosswalk dataLocations to an array of File items. // The dataLocations are expected to have contentSize, // encodingFormat and fileFormat already set // This assumes that the directory is flat ie all of the files // are in the root function make_files(datapub) { if( !datapub['dataLocations'] ) { return []; } const files = datapub['dataLocations'].map((dl) => { if( dl['type'] == 'attachment' ) { return { '@id': dl['name'], 'name': dl['name'], 'path': dl['name'], '@type': 'File', 'contentSize': dl['contentSize'], 'encodingFormat': dl['encodingFormat'], 'fileFormat': dl['fileFormat'] } } else { // todo: URLs and physical locations return undefined; } }); return files.filter((x) => x); } // creators(datapub) // // returns the array of Person items from the creators // field of the data publication // TODO: affiliations function make_creators(datapub, organisation) { if( !datapub['creators'] ) { throw Error("A DataCrate has to have at least one creator"); } const creators = datapub['creators'].map((p) => { const id = p['orcid'] || p['email'] || p['text_full_name']; if( id ) { return { '@id': id, '@type': 'Person', 'identifier': id, 'name': p['text_full_name'], 'familyName': p['family_name'], 'givenName': p['given_name'], 'email': p['email'], 'affiliation': link_id(organisation) } } else { // warn for unidentifiable creators return undefined; } }) const nc = creators.filter((x) => x) return nc; } // make related-publication etc links // this returns an array of arrays of items, which will be _.flattened // when the graph is populated // note that these are very simple - just URI, title and description // FIXME this assumes that there's a URI which can be used as the // @id function make_related_works(datapub) { return RELATED.map((thing) => { const field = 'related_' + thing; return datapub[field].map((item) => { const ld_item = { '@id': item['related_url'], '@type': RELATED_TYPES[thing], 'identifier': item['related_url'], 'name': item['related_title'], }; if( item['related_notes']) { ld_item['description'] = item['related_notes']; } if( item['related_notes']) { ld_item['description'] = item['related_notes']; } return ld_item; }) }) } // Both grants (as projects) and funding bodies are represented // in DataCrates as Organizations with the relationship 'funded' // TODO - can we do chaining as per the spec? // TODO - bring the project in from the RDMP? function make_funders(datapub) { var f = []; return FUNDING_FIELDS.map((field) => { if( datapub[field] ) { return datapub[field].map((forg) => { return { '@id': forg['dc_identifier'], 'identifier': forg['dc_identifier'], '@type': 'Organization', 'name': forg['dc_title'] } }); } else { return []; } }); } function make_subjects(datapub) { return SUBJECT_FIELDS.map((field) => { if( datapub[field] ) { return datapub[field].map((subj) => { const id = SUBJECT_IRI_PREFIX[field] + subj; return { '@id': id, 'identifier': id, 'name': subj }; }) } }); } // spatialCoverage function make_spatial(datapub) { var items = []; var i = 1; if( datapub['geolocations'] ) { items = datapub['geolocations'].map((gl) => { const id = '_:spatial/' + String(i); i += 1; return { '@id': id, 'identifier': id, 'description': gl } }); } if( datapub['geospatial'] ) { // FIXME - need to see what this looks like in a real record } return items; } // temporalCoverage function make_temporal(datapub) { var items = []; var i = 1; if( datapub['startDate'] || datapub['endDate'] ) { const id = '_:temporal/' + String(i); items.push({ '@id': id, 'identifier': id }) } } function make_licence(datapub) { return undefined; } module.exports = { 'datapub2catalog': datapub2catalog };