/* This is part of datacrate, a node.js library for working with
   DataCrates. Copyright (C) 2018 University of Technology Sydney

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>. */

const fs = require('fs-extra');
const _ = require('lodash');
const path = require('path');
const calcyte = require('calcyte');

const DEFAULTS = path.join(__dirname, "../defaults");
//const CONTEXT = path.join(DEFAULTS, 'context.json');
const GEOCONTEXT = path.join(DEFAULTS, 'geojson-context.json');

const RELATED = [
  'publications',
  'websites',
  'metadata',
  'data',
  'services'
];

const RELATED_TYPES = {
  'publications': 'ScholarlyArticle',
  'websites': 'WebSite',
  'metadata': 'CreativeWork',  // schema.org hasn't got a dedicated type
  'data': 'Dataset',
  'services': 'CreativeWork'   // I'm not sure about this?
};

const ROOT_PATH_BAGGED = 'data/';
const ROOT_PATH_WORKING = './';

const FUNDING_FIELDS = [
  'foaf:fundedBy_foaf:Agent',
  'foaf:fundedBy_vivo:Grant'
];

const SUBJECT_FIELDS = [
  'dc:subject_anzsrc:for',
  'dc:subject_anzsrc:seo'
];

const DEFAULT_IRI_PREFS = {
  'about': {
    'dc:subject_anzsrc:for': '_:FOR/',
    'dc:subject_anzsrc:seo': '_:SEO/'
  },
  'spatialCoverage': '_:spatial/',
  'funder': '_:funder/',
  'license': '_:license/',
  'citation': '_:citation/',
  'contact': '_:contact/',
  'location': '_:location/'
};

// TODO - a general function for minting new internal IRIs given one
// of the above prefixes

// dataset -> about            -> subjects
// dataset -> spatialCoverage  -> place
// dataset -> temporalCoverage -> time
// dataset -> funder           -> organisation
// dataset -> license          -> licence (CreativeWork or URL)
// dataset -> citation         -> related publications, websites, articles, datasets

// The keys here are schema.org properties on the root Dataset, and the
// values are functions which take the data publication as an argument
// and return null or an array of property values.

const DATASET_PROPERTIES = {
  'about': make_subjects,
  'spatialCoverage': make_spatial,
  'funder': make_funders,
  'license': make_license,
  'citation': make_related_works
};

/* datapub2catalog(options)

   Convert metadata from a redbox data publication and its data record
   to a JSON-LD DataCrate catalogue.

   options = {
     id: redbox oid,
     datapub: redbox data publication record,
     organisation: {
       id: organisation identifier,
       name: organisation name
     },
     owner: email of the data publication record's owner,
     approver: email of the librarian who approved publication,
     dates: {
       dateCreated: Date object or string,
       datePublished: Date object or string
     },
     bagged: if true, the dataset root path is 'data/' rather than './',
     zip: path or URL of a zip distribution, added as a DataDownload,
     prefixes: optional overrides for DEFAULT_IRI_PREFS
   }

   If there are no dates, or either dateCreated or datePublished is
   undefined, now() is used as the default.

   The owner and approver are used to build two Actions against the
   object - creation and publication.

   TODO: add contentSize, encodingFormat and fileFormat from siegfried
   to the dataLocations array.

   Remember to keep the connection to the data record.
*/
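/* Example call (a minimal sketch, not taken from the redbox codebase -
   the option values below are hypothetical and `myDataPub` stands in
   for a real redbox data publication record; call from inside an async
   function):

     const catalog = await datapub2catalog({
       id: 'e8c19b5d2a',
       datapub: myDataPub,
       organisation: { id: 'https://example.edu.au', name: 'Example University' },
       owner: 'owner@example.edu.au',
       approver: 'librarian@example.edu.au',
       dates: { dateCreated: '2018-01-01', datePublished: '2018-06-01' },
       bagged: true,
       zip: 'data.zip'
     });
*/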
async function datapub2catalog(options) {
  const id = options['id'];
  const datapub = options['datapub'];
  const bagged = options['bagged'];
  const zip = options['zip'];
  const org = options['organisation'];
  const owner = options['owner'];
  const approver = options['approver'];
  const dates = dates_default(options['dates']);
  const prefixes = options['prefixes'] || DEFAULT_IRI_PREFS;

  const organisation = {
    '@id': org['id'],
    '@type': 'Organization',
    'identifier': org['id'],
    'name': org['name']
  };

  if( ! (organisation['@id'] && organisation['name']) ) {
    throw Error("Organization must have an id and a name");
  }

  // assumes that all contributors are affiliated to this one organisation
  // this also makes the data manager

  const contributors = make_contributors(datapub, organisation);
  var people = contributors[0];
  const files = make_files(datapub);
  const contactPoint = make_contactPoint(contributors[2]);
  const dataset = make_dataset(
    id, datapub, bagged, organisation, dates, files, contributors
  );
  const [ history, more_people ] = make_history(
    dataset, people, owner, approver, dates
  );

  if( more_people.length > 0 ) {
    people = people.concat(more_people);
  }
  people = _.uniqBy(people, '@id');

  const graph = _.flatten([
    dataset, organisation, people, contactPoint, files, history
  ]);

  // These are functions which try to get various DataCrate properties
  // from the datapub: if they return anything, the results get pushed
  // onto the graph and added to the dataset.

  _.forEach(DATASET_PROPERTIES, ( make_items, property ) => {
    const items = make_items(datapub, prefixes[property]);
    if( items && items.length > 0 ) {
      const eitems = _.flatten(items.filter((i) => i));
      if( eitems.length > 0 ) {
        dataset[property] = eitems.map(link_id);
        graph.push(...eitems);
      }
    }
  });

  const context = calcyte.defaults.context;

  // this isn't working with trim_context, which is clobbering the
  // geojson types for reasons I don't understand - see PT

  if( datapub['geospatial'] ) {
    const geocontext = await fs.readJson(GEOCONTEXT);
    Object.keys(geocontext['@context']).map((k) => {
      if( context[k] ) {
        console.error("Duplicate geocontext for " + k);
      }
      context[k] = geocontext['@context'][k];
    });
  }

  if( zip ) {
    if( !dataset["distribution"] ) {
      dataset["distribution"] = [];
    }
    dataset["distribution"].push({ "@id": zip });
    graph.push({
      "@id": zip,
      "contentUrl": zip,
      "@type": "DataDownload",
      "encodingFormat": "zip"
    });
    context["DataDownload"] = "https://schema.org/DataDownload";
    context["distribution"] = "https://schema.org/distribution";
  }

  const catalog = {
    '@context': context,
    '@graph': graph.filter((e) => e)
  };

  const jsonld = new calcyte.jsonld();
  jsonld.init(catalog);
  jsonld.trim_context();
  return jsonld.json_ld;
}

function dates_default(values) {
  var dates = {};
  const now = (new Date()).toISOString();
  [ 'dateCreated', 'datePublished' ].map((d) => {
    if( values && values[d] ) {
      dates[d] = ( typeof(values[d]) === 'object' ) ?
        values[d].toISOString() : values[d];
    } else {
      dates[d] = now;
    }
  });
  return dates;
}

function link_id(item) {
  return { '@id': item['@id'] };
}
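/* For reference, the catalogue returned by datapub2catalog (before
   calcyte trims the context) is a flat JSON-LD document along these
   lines - an illustrative sketch only, with hypothetical values:

     {
       "@context": { ... },
       "@graph": [
         { "@id": "e8c19b5d2a", "@type": "Dataset", "path": "data/", ... },
         { "@id": "https://example.edu.au", "@type": "Organization", ... },
         { "@id": "a.researcher@example.edu.au", "@type": "Person", ... },
         { "@id": "_:contact/dm@example.edu.au", "@type": "ContactPoint", ... },
         { "@id": "report.pdf", "@type": "File", ... },
         { "@id": "history1", "@type": "CreateAction", ... },
         { "@id": "history2", "@type": "UpdateAction", ... }
       ]
     }
*/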
// TODO: isBasedOn the data record
// always does data/ as the root path

function make_dataset(id, datapub, bagged, organisation, dates, files, contributors) {
  const ds = {
    '@id': id,
    '@type': 'Dataset',
    'path': bagged ? ROOT_PATH_BAGGED : ROOT_PATH_WORKING,
    'name': datapub['title'],
    'description': datapub['description'],
    'dateCreated': dates['dateCreated'],
    'datePublished': dates['datePublished'],
    'publisher': link_id(organisation),
    'hasPart': files.map(link_id)
  };

  const people = contributors[0];
  const roles = contributors[1];
  const dm = contributors[2];

  people.map(p => {
    const role = roles[p['@id']];
    if( ! (role in ds) ) {
      ds[role] = [];
    }
    ds[role].push(link_id(p));
  });

  if( files.length === 0 ) {
    // no files: record the access conditions instead
    ds['conditionsOfAccess'] = datapub['accessRights_text'];
  }

  if( dm ) {
    ds['contactPoint'] = { '@id': dm['contactPoint'] };
  }

  // see https://schema.org/keywords
  if( datapub['finalKeywords'] ) {
    ds['keywords'] = datapub['finalKeywords'].join(', ');
  }

  const temporal = make_temporal(datapub);
  if( temporal ) {
    ds['temporalCoverage'] = temporal;
  }

  return ds;
}

function find_by_email(people, email) {
  const match = people.filter((x) => x['email'] == email);
  if( match.length ) {
    return match[0];
  } else {
    return undefined;
  }
}

function make_history(dataset, people, owner, approver, dates) {
  var owner_item = find_by_email(people, owner);
  var approver_item = find_by_email(people, approver);
  const new_people = [];

  if( !owner_item ) {
    owner_item = {
      '@id': owner,
      '@type': 'Person',
      'email': owner
    };
    new_people.push(owner_item);
  }

  if( !approver_item ) {
    approver_item = {
      '@id': approver,
      '@type': 'Person',
      'email': approver
    };
    new_people.push(approver_item);
  }

  // TODO: descriptions for these and maybe a link to the
  // data record for the CreateAction

  const history = [
    {
      '@id': 'history1',
      '@type': 'CreateAction',
      'name': 'Create',
      'description': 'Data record created',
      'endTime': dates['dateCreated'],
      'object': link_id(dataset),
      'agent': link_id(owner_item)
    },
    {
      '@id': 'history2',
      '@type': 'UpdateAction',
      'name': 'Publish',
      'endTime': dates['datePublished'],
      'object': link_id(dataset),
      'agent': link_id(approver_item)
    }
  ];

  return [ history, new_people ];
}

// make_files(datapub)
//
// Crosswalks dataLocations to an array of File items. The dataLocations
// are expected to have contentSize, encodingFormat and fileFormat
// already set. This assumes that the directory is flat, ie all of the
// files are in the root.

function make_files(datapub) {
  // note: if accessRightsToggle is true, the publication is metadata-only
  // this sucks and has been flagged for fixing
  if( !datapub['dataLocations'] || datapub['accessRightsToggle'] ) {
    return [];
  }

  const files = datapub['dataLocations'].map((dl) => {
    if( dl['selected'] ) {
      if( dl['type'] === 'attachment' ) {
        return {
          '@id': dl['path'],
          'path': dl['path'],
          'name': dl['name'],
          '@type': 'File',
          'description': dl['notes'],
          'contentSize': dl['contentSize'],
          'encodingFormat': dl['mimeType']
        };
      }
      // Not using the url as the @id because there's no guarantee it won't
      // accidentally match another @id - prefix it
      if( dl['type'] === 'url' ) {
        return {
          '@id': DEFAULT_IRI_PREFS['location'] + dl['location'],
          'name': dl['location'],
          '@type': 'WebSite',
          'description': dl['notes'],
          'url': dl['location']
        };
      }
      // physical locations or file paths don't get anything
      return undefined;
    }
  });

  return files.filter((x) => x);
}
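/* For example, an attachment-type dataLocations entry roughly like this
   (hypothetical values):

     { selected: true, type: 'attachment', path: 'report.pdf', name: 'report.pdf',
       notes: 'Final report', contentSize: '1.2 MB', mimeType: 'application/pdf' }

   is crosswalked by make_files to a File item:

     { '@id': 'report.pdf', 'path': 'report.pdf', 'name': 'report.pdf',
       '@type': 'File', 'description': 'Final report',
       'contentSize': '1.2 MB', 'encodingFormat': 'application/pdf' }

   while url-type entries become WebSite items keyed on '_:location/' + location.
*/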
// make_contributors(datapub)
//
// Returns the array of Person items from the creators field of the
// data publication, plus the data manager. Also deduplicates the list
// and sets the data manager's contactPoint id.
// Doesn't add the creators to the dataset - the calling code is
// responsible for that.
// TODO: affiliations
// returns [ people, roles, data_manager ]

function make_contributors(datapub, organisation) {
  if( !datapub['creators'] ) {
    throw Error("A DataCrate has to have at least one creator");
  }
  const roles = {};
  const people = _.compact(
    datapub['creators'].map(c => make_person(c, organisation))
  );
  people.map(p => { roles[p['@id']] = 'creator' });

  var datamanager = undefined;

  if( datapub['contributor_data_manager'] ) {
    const dm = make_person(datapub['contributor_data_manager'], organisation);
    const cpid = DEFAULT_IRI_PREFS['contact'] + dm['@id'];
    dm['contactPoint'] = cpid;
    if( !(dm['@id'] in roles) ) {
      // data manager is not a creator, but a contributor
      roles[dm['@id']] = 'contributor';
      people.push(dm);
    } else {
      people.filter(p => p['@id'] === dm['@id']).map(p => {
        p['contactPoint'] = cpid;
      });
    }
    datamanager = dm;
  }

  const uc = _.uniqBy(people, '@id');
  return [ uc, roles, datamanager ];
}

function make_person(p, organisation) {
  const id = p['email'] || p['orcid'] || p['text_full_name'];
  if( id ) {
    return {
      '@id': id,
      '@type': 'Person',
      'identifier': id,
      'name': p['text_full_name'],
      'familyName': p['family_name'],
      'givenName': p['given_name'],
      'email': p['email'],
      'affiliation': link_id(organisation)
    };
  } else {
    return undefined;
  }
}

function make_contactPoint(dm) {
  if( !dm ) {
    // no data manager on the datapub - nothing to make a contact point from
    return undefined;
  }
  const id = DEFAULT_IRI_PREFS['contact'] + dm['@id'];
  return {
    '@id': id,
    '@type': 'ContactPoint',
    'contactType': 'Data Manager',
    'email': dm['email'],
    'identifier': id
  };
}

// Make related-publication etc links.
// This returns an array of arrays of items, which will be _.flattened
// when the graph is populated.
// Note that these are very simple - just URI, title and description.
// FIXME this assumes that there's a URI which can be used as the @id

function make_related_works(datapub) {
  return RELATED.map((thing) => {
    const field = 'related_' + thing;
    var values = [];
    if( field in datapub ) {
      values = datapub[field];
    }
    return values.filter((f) => f['related_url']).map((item) => {
      const ld_item = {
        '@id': item['related_url'],
        '@type': RELATED_TYPES[thing],
        'identifier': item['related_url'],
        'name': item['related_title']
      };
      if( item['related_notes'] ) {
        ld_item['description'] = item['related_notes'];
      }
      return ld_item;
    });
  });
}
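/* For example, a related_publications entry roughly like this
   (hypothetical values):

     { related_url: 'https://doi.org/10.1000/example',
       related_title: 'An example paper',
       related_notes: 'Describes the method used to collect the data' }

   becomes a citation item of @type ScholarlyArticle keyed on the URL:

     { '@id': 'https://doi.org/10.1000/example', '@type': 'ScholarlyArticle',
       'identifier': 'https://doi.org/10.1000/example', 'name': 'An example paper',
       'description': 'Describes the method used to collect the data' }
*/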
// Both grants (as projects) and funding bodies are represented in
// DataCrates as Organizations with the relationship 'funded'.
// TODO - can we do chaining as per the spec?
// TODO - bring the project in from the RDMP?

function make_funders(datapub, prefix) {
  return FUNDING_FIELDS.map((field) => {
    if( datapub[field] ) {
      const funders = datapub[field].filter((forg) => forg['dc_identifier']);
      return funders.map((forg) => {
        const id = prefix + forg['dc_identifier'][0];
        return {
          '@id': id,
          'identifier': id,
          '@type': 'Organization',
          'name': forg['dc_title']
        };
      });
    } else {
      return [];
    }
  });
}

// note - keywords are added on the dataset item

function make_subjects(datapub, prefixes) {
  return SUBJECT_FIELDS.map((field) => {
    if( datapub[field] ) {
      return datapub[field].map((cat) => {
        const id = prefixes[field] + cat['notation'];
        return {
          '@id': id,
          '@type': 'StructuredValue',
          'url': id,
          'identifier': id,
          'name': cat['name']
        };
      });
    } else {
      return [];
    }
  });
}

// spatialCoverage

function make_spatial(datapub, prefix) {
  var items = [];
  var i = 1;

  if( datapub['geolocations'] ) {
    const geols = datapub['geolocations'].filter((g) => !_.isEmpty(g));
    items = geols.map((gl) => {
      const id = prefix + String(i);
      i += 1;
      return {
        '@id': id,
        'identifier': id,
        '@type': 'Place',
        'name': gl['basic_name'],
        'geo': {
          '@type': "GeoCoordinates",
          'latitude': gl['latitude'],
          'longitude': gl['longitude']
        }
      };
    });
  }

  if( datapub['geospatial'] && datapub['geospatial']['type'] ) {
    // These are GeoJSON, for which there is a JSON-LD context defined
    // here: http://geojson.org/geojson-ld/
    // Upstairs, datapub2catalog checks for a geospatial item in the
    // datapub and adds the GeoJSON context to the document before
    // trimming it.
    const id = prefix + String(i);
    items.push({
      '@id': id,
      'identifier': id,
      '@type': datapub['geospatial']['type'],
      'name': id,
      'features': datapub['geospatial']['features']
    });
  }

  return items;
}

// temporalCoverage is not a linked item but a Text or DateTime property
// on the dataset. This merges all of the datapub time properties as
// follows:
//   startDate
//   startDate/endDate
//   endDate
//   timePeriod
//   startDate; timePeriod .. etc

function make_temporal(datapub) {
  var tc = '';
  if( datapub['startDate'] ) {
    tc = datapub['startDate'];
    if( datapub['endDate'] ) {
      tc += '/' + datapub['endDate'];
    }
  } else if ( datapub['endDate'] ) {
    tc = datapub['endDate'];
  }
  if( datapub['timePeriod'] ) {
    if( tc ) {
      tc = tc + '; ' + datapub['timePeriod'];
    } else {
      tc = datapub['timePeriod'];
    }
  }
  return tc;
}

// This assumes that the license_other supersedes the license_statement.

function make_license(datapub, prefix) {
  const licenses = [];

  if( datapub['license_other_url'] || datapub['license_notes'] ) {
    if( datapub['license_other_url'] ) {
      licenses.push({
        '@id': DEFAULT_IRI_PREFS['license'] + datapub['license_other_url'],
        '@type': 'CreativeWork',
        'url': datapub['license_other_url'],
        'name': ( datapub['license_notes'] || datapub['license_other_url'] )
      });
    } else {
      licenses.push({
        '@id': DEFAULT_IRI_PREFS['license'] + 'other',
        '@type': 'CreativeWork',
        'name': datapub['license_notes']
      });
    }
  }

  if( datapub['license_identifier'] && datapub['license_identifier'] !== 'undefined' ) {
    licenses.push({
      '@id': DEFAULT_IRI_PREFS['license'] + datapub['license_identifier'],
      '@type': 'CreativeWork',
      'name': datapub['license_identifier'],
      'url': datapub['license_identifier']
    });
  }

  if( datapub['accessRights_url'] ) {
    licenses.push({
      '@id': DEFAULT_IRI_PREFS['license'] + datapub['accessRights_url'],
      '@type': 'WebSite',
      'name': "Conditions of Access",
      'url': datapub['accessRights_url']
    });
  }

  return licenses;
}

// Not yet implemented.
function make_access_rights(datapub) {
}

module.exports = {
  'datapub2catalog': datapub2catalog
};
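/* Worked example of the temporalCoverage merging in make_temporal above
   (hypothetical values): a datapub with startDate '2017-01-01', endDate
   '2017-12-31' and timePeriod 'Austral summer' yields the string
   '2017-01-01/2017-12-31; Austral summer'. */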