/*
This is part of datacrate, a node.js library for working with
DataCrates. Copyright (C) 2018 University of Technology Sydney
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
const fs = require('fs-extra');
const path = require('path');
const _ = require('lodash');
const calcyte = require('calcyte');
const DEFAULTS = path.join(__dirname, "../defaults");
//const CONTEXT = path.join(DEFAULTS, 'context.json');
const GEOCONTEXT = path.join(DEFAULTS, 'geojson-context.json');

// datapub fields which hold related works ('related_' + name), and the
// schema.org type each kind of related work is crosswalked to
const RELATED = [ 'publications', 'websites', 'metadata', 'data', 'services' ];
const RELATED_TYPES = {
    'publications': 'ScholarlyArticle',
    'websites': 'WebSite',
    'metadata': 'CreativeWork', // schema.org hasn't got a dedicated type
    'data': 'Dataset',
    'services': 'CreativeWork' // I'm not sure about this?
};

// root path of the payload: bagged crates keep the payload under data/
const ROOT_PATH_BAGGED = 'data/';
const ROOT_PATH_WORKING = './';

const FUNDING_FIELDS = [ 'foaf:fundedBy_foaf:Agent', 'foaf:fundedBy_vivo:Grant' ];
const SUBJECT_FIELDS = [ 'dc:subject_anzsrc:for', 'dc:subject_anzsrc:seo' ];

// prefixes for minting internal (blank-node) IRIs, keyed by the
// dataset property the minted items hang off
const DEFAULT_IRI_PREFS = {
    'about': {
        'dc:subject_anzsrc:for': '_:FOR/',
        'dc:subject_anzsrc:seo': '_:SEO/'
    },
    'spatialCoverage': '_:spatial/',
    'funder': '_:funder/',
    'citation': '_:citation/'
};

// TODO - a general function for minting new internal IRIs given one
// of the above prefixes ()

// dataset -> about -> subjects
// dataset -> spatialCoverage -> place
// dataset -> temporalCoverage -> time
// dataset -> funder -> organisation
// dataset -> license -> licence (CreativeWork or URL)
// dataset -> citation -> related publications, websites, articles, datasets

// the keys here are schema.org properties on the root DataSet
// and the values are functions which take the data publication
// as an argument and return null or an array of property values
// (the function declarations below are hoisted, so forward
// references are safe here)
const DATASET_PROPERTIES = {
    'about': make_subjects,
    'spatialCoverage': make_spatial,
    'funder': make_funders,
    'citation': make_related_works
};
/* datapub2catalog(options)
Convert metadata from a redbox data publication and
its data record to a JSON-LD datacrate catalogue
options = {
id: redbox oid,
datapub: redbox data publication record,
org: {
@id: organisation identifier,
name: organisation name
},
owner: email of the data pub's record,
approver: email of the librarian who approved publication
dates: {
dateCreated: Date object or string}
datePublished: Date object or string
}
}
If there are no dates, or either dateCreated or datePublished is
undefined, now() is used as the default.
The owner and approver are used to build two Actions against
the object - creation and publication.
TODO: add contentSize, encodingFormat and fileFormat from
siegfried to the dataLocations array
Remember to keep the connection to the data record
*/
async function datapub2catalog(options) {
    const id = options['id'];
    const datapub = options['datapub'];
    const bagged = options['bagged'];
    const zip = options['zip'];
    const org = options['organisation'];
    const owner = options['owner'];
    const approver = options['approver'];
    const dates = dates_default(options['dates']);
    const prefixes = options['prefixes'] || DEFAULT_IRI_PREFS;

    const organisation = {
        '@id': org['@id'],  // required: the check below rejects an org without an @id
        '@type': 'Organization',
        'name': org['name']
    };
    if( ! (organisation['@id'] && organisation['name'] ) ) {
        throw Error("Organization must have an id and a name");
    }
    // assumes that all creators are affiliated to this one organisation
    var people = make_creators(datapub, organisation);
    const files = make_files(datapub);
    const dataset = make_dataset(id, datapub, bagged, organisation, dates, files, people);
    const [ history, more_people ] = make_history(
        dataset, people, owner, approver, dates
    );
    if( more_people.length > 0 ) {
        people = people.concat(more_people);
    }
    // people must be in the graph: the dataset and history items only
    // link to them by @id
    const graph = _.flatten([
        dataset,
        organisation,
        people,
        history
    ]);
    // these are functions which try to get various DataCrate
    // properties from the datapub: if they return anything,
    // the results get pushed onto the graph and added to
    // the dataset
    _.forEach(DATASET_PROPERTIES, ( make_items, property ) => {
        const items = make_items(datapub, prefixes[property]);
        if( items && items.length > 0 ) {
            // drop empty slots, then flatten the arrays-of-arrays the
            // make_* functions return
            const eitems = _.flatten(items.filter((i) => i));
            if( eitems.length > 0 ) { // was `if( eitems )` - an array is always truthy
                dataset[property] = eitems.map(link_id);
                graph.push(...eitems);
            }
        }
    });
    // NOTE(review): this mutates the shared calcyte default context -
    // confirm that's intended
    const context = calcyte.defaults.context;
    // this isn't working with trim_context, which is clobbering the
    // geojson types for reasons I don't understand - see PT
    if( datapub['geospatial'] ) {
        const geocontext = await fs.readJson(GEOCONTEXT);
        Object.keys(geocontext['@context']).forEach((k) => {
            if( context[k] ) {
                console.error("Duplicate geocontext for " + k);
            }
            // copy the term definition itself: the terms live under
            // '@context', so geocontext[k] would be undefined
            context[k] = geocontext['@context'][k];
        });
    }
    if( zip ) {
        // advertise the zipped crate as a schema.org DataDownload
        if( !dataset["distribution"] ) {
            dataset["distribution"] = [];
        }
        dataset["distribution"].push(
            {
                "@id": zip
            }
        );
        graph.push(
            {
                "@id": zip,
                "contentUrl": zip,
                "@type": "DataDownload",
                "encodingFormat": "zip"
            });
        context["DataDownload"] = "https://schema.org/DataDownload";
        context["distribution"] = "https://schema.org/distribution";
    }
    const catalog = {
        '@context': context,
        '@graph': graph.filter((e) => e)
    };
    const jsonld = new calcyte.jsonld();
    jsonld.init(catalog);
    jsonld.trim_context();
    return jsonld.json_ld;
}
// Normalise the dates option: returns an object with dateCreated and
// datePublished as ISO-8601 strings. Date objects are converted with
// toISOString(); missing or falsy values default to now.
function dates_default(values) {
    const now = (new Date()).toISOString();
    const result = {};
    for( const key of [ 'dateCreated', 'datePublished' ] ) {
        const supplied = values ? values[key] : undefined;
        if( supplied ) {
            result[key] = ( typeof(supplied) === 'object' ) ? supplied.toISOString() : supplied;
        } else {
            result[key] = now;
        }
    }
    return result;
}
// Return a JSON-LD reference node for an item: just its @id, so the
// full item can live elsewhere in the @graph.
function link_id(item) {
    const ref = {};
    ref['@id'] = item['@id'];
    return ref;
}
// TODO: isBasedOn the data record
// always does data/ as the root path

// Build the root Dataset item for the catalogue: core descriptive
// metadata plus @id links to the files and creators.
// NOTE(review): no '@type' is set here - confirm whether calcyte adds
// 'Dataset' or whether it should be added in this crosswalk.
function make_dataset(id, datapub, bagged, organisation, dates, files, creators) {
    const ds = {
        '@id': id,
        'path': bagged ? ROOT_PATH_BAGGED : ROOT_PATH_WORKING,
        'name': datapub['title'],
        'description': datapub['description'],
        'dateCreated': dates['dateCreated'],
        'datePublished': dates['datePublished'],
        'publisher': link_id(organisation),
        'hasPart': files.map(link_id),
        'creator': creators.map(link_id)
    };
    // see https://schema.org/keywords - comma-separated text
    if( datapub['finalKeywords'] ) {
        ds['keywords'] = datapub['finalKeywords'].join(', ');
    }
    const temporal = make_temporal(datapub);
    if( temporal ) {
        ds['temporalCoverage'] = temporal;
    }
    return ds;
}
// Look up a person item by its 'email' field; returns the first match
// or undefined if nobody matches.
function find_by_email(people, email) {
    return people.find((person) => person['email'] == email);
}
// Build the provenance history for the dataset: a CreateAction (owner)
// and an UpdateAction named 'Publish' (approver). If either actor isn't
// already in people, a minimal stub item (@id = email) is minted and
// returned in new_people so the caller can add it to the graph.
// Returns [ history, new_people ].
function make_history(dataset, people, owner, approver, dates) {
    const new_people = [];
    // resolve an email to an existing person, or mint a stub
    const resolve = (email) => {
        const existing = find_by_email(people, email);
        if( existing ) {
            return existing;
        }
        const stub = {
            '@id': email,
            'email': email
        };
        new_people.push(stub);
        return stub;
    };
    const owner_item = resolve(owner);
    const approver_item = resolve(approver);
    // TODO: descriptions for these and maybe a link to the
    // data record for the CreateAction
    const history = [
        {
            '@type': 'CreateAction',
            'name': 'Create',
            'endTime': dates['dateCreated'],
            'object': link_id(dataset),
            'agent': link_id(owner_item)
        },
        {
            '@type': 'UpdateAction',
            'name': 'Publish',
            'endTime': dates['datePublished'],
            'object': link_id(dataset),
            'agent': link_id(approver_item)
        }
    ];
    return [ history, new_people ];
}
// files(datapub)
// crosswalk dataLocations to an array of File items.
// The dataLocations are expected to have contentSize,
// encodingFormat and fileFormat already set
// This assumes that the directory is flat ie all of the files
// are in the root
function make_files(datapub) {
    const locations = datapub['dataLocations'];
    if( !locations ) {
        return [];
    }
    const files = [];
    for( const dl of locations ) {
        if( dl['type'] == 'attachment' ) {
            files.push({
                '@id': dl['name'],
                'name': dl['name'],
                '@type': 'File',
                'contentSize': dl['contentSize'],
            });
        }
        // todo: URLs and physical locations
    }
    return files;
}
// creators(datapub)
//
// returns the array of Person items from the creators
// field of the data publication
// Doesn't add the creators to the datapub - the calling
// code is responsible for that.
// TODO: affiliations
// NOTE(review): the `if( id )` / `return {` lines were lost from this
// file and have been reconstructed - each creator is identified by
// orcid, email or full name, in that order of preference.
function make_creators(datapub, organisation) {
    if( !datapub['creators'] ) {
        throw Error("A DataCrate has to have at least one creator");
    }
    const creators = datapub['creators'].map((p) => {
        const id = p['orcid'] || p['email'] || p['text_full_name'];
        if( id ) {
            return {
                '@id': id,
                'identifier': id,
                'name': p['text_full_name'],
                'familyName': p['family_name'],
                'givenName': p['given_name'],
                'email': p['email'],
                'affiliation': link_id(organisation)
            };
        } else {
            // warn for unidentifiable creators
            return undefined;
        }
    });
    // drop the unidentifiable creators
    const nc = creators.filter((x) => x);
    return nc;
}
// make related-publication etc links
// this returns an array of arrays of items, which will be _.flattened
// when the graph is populated
// note that these are very simple - just URI, title and description
// FIXME this assumes that there's a URI which can be used as the
// @id
function make_related_works(datapub) {
    return RELATED.map((thing) => {
        const field = 'related_' + thing;
        var values = [];
        if( field in datapub ) {
            values = datapub[field];
        }
        // only items with a URL can be crosswalked: the URL is the @id
        return values.filter((f) => f['related_url']).map((item) => {
            const ld_item = {
                '@id': item['related_url'],
                '@type': RELATED_TYPES[thing],
                'identifier': item['related_url'],
                'name': item['related_title'],
            };
            // (removed a duplicated copy of this if-block)
            if( item['related_notes'] ) {
                ld_item['description'] = item['related_notes'];
            }
            return ld_item;
        });
    });
}
// Both grants (as projects) and funding bodies are represented
// in DataCrates as Organizations with the relationship 'funded'
// TODO - can we do chaining as per the spec?
// TODO - bring the project in from the RDMP?
// Returns an array of arrays (one per FUNDING_FIELDS entry, [] where
// the field is absent) which the caller flattens.
// NOTE(review): the `return {` line was lost from this file and has
// been reconstructed; the @id is minted from the first dc_identifier.
function make_funders(datapub, prefix) {
    return FUNDING_FIELDS.map((field) => {
        if( datapub[field] ) {
            // only funders with an identifier can be crosswalked
            const funders = datapub[field].filter((forg) => forg['dc_identifier']);
            return funders.map((forg) => {
                const id = prefix + forg['dc_identifier'][0];
                return {
                    '@id': id,
                    'identifier': id,
                    '@type': 'Organization',
                    'name': forg['dc_title']
                };
            });
        } else {
            return [];
        }
    });
}
// note - keywords are added on the dataset item
// Crosswalk ANZSRC FOR/SEO subject codes to items with minted @ids.
// Returns an array of arrays (one per SUBJECT_FIELDS entry, [] where
// the field is absent) which the caller flattens.
// NOTE(review): the inner map over subject codes was lost from this
// file and has been reconstructed on the assumption that each value
// carries a 'notation' (the ANZSRC code) - confirm against the
// redbox data publication schema.
function make_subjects(datapub, prefixes) {
    return SUBJECT_FIELDS.map((field) => {
        if( datapub[field] ) {
            return datapub[field].map((cat) => {
                const id = prefixes[field] + cat['notation'];
                return {
                    '@id': id,
                    'identifier': id,
                };
            });
        } else {
            return [];
        }
    });
}
// spatialCoverage
// Crosswalk geolocations (point coordinates) and geospatial (GeoJSON)
// to Place/GeoJSON items with minted @ids (prefix + running counter).
// Returns a flat array of items; [] if neither field is present.
function make_spatial(datapub, prefix) {
    var items = [];
    var i = 1; // counter shared by both sections so @ids don't collide
    if( datapub['geolocations'] ) {
        const geols = datapub['geolocations'].filter((g) => !_.isEmpty(g));
        items = geols.map((gl) => {
            const id = prefix + String(i);
            i += 1;
            return {
                '@id': id,
                'identifier': id,
                '@type': 'Place',
                'name': gl['basic_name'],
                'geo': {
                    '@type': "GeoCoordinates",
                    'latitude': gl['latitude'],
                    'longitude': gl['longitude']
                }
            };
        });
    }
    if( datapub['geospatial'] && datapub['geospatial']['type'] ) {
        // these are GeoJSON, for which there is a JSON-LD context
        // defined here: http://geojson.org/geojson-ld/
        // Upstairs, the code that calls this checks for a geospatial
        // item in the datapub and adds the GeoJSON context to the
        // document before trimming it
        const id = prefix + String(i);
        items.push({
            '@id': id,
            'identifier': id,
            '@type': datapub['geospatial']['type'],
            'name': id,
            'features': datapub['geospatial']['features']
        });
    }
    return items;
}
// temporal_coverage is not a linked item but a Text or
// DateTime property on the dataset
// this merges all of the datapub time properties as follows:
// startDate
// startDate/endDate
// endDate
// timePeriod
// startDate; timePeriod .. etc
// Returns '' if no temporal fields are present (caller treats that
// as "no temporalCoverage").
function make_temporal(datapub) {
    var tc = '';
    if( datapub['startDate'] ) {
        tc = datapub['startDate'];
        if( datapub['endDate'] ) {
            // ISO-8601 style interval: start/end
            tc += '/' + datapub['endDate'];
        }
    } else if ( datapub['endDate'] ) {
        tc = datapub['endDate'];
    }
    if( datapub['timePeriod'] ) {
        if( tc ) {
            tc = tc + '; ' + datapub['timePeriod'];
        } else {
            tc = datapub['timePeriod'];
        }
    }
    return tc;
}
// this assumes that the license_other supersedes
// the license_statement
// Build an array of CreativeWork licence items from the datapub's
// licence fields; returns [] if none are present.
// NOTE(review): this function isn't wired into DATASET_PROPERTIES even
// though the property map comment lists dataset -> license - confirm
// whether that's intentional. The `licenses` declaration and several
// closing braces were lost from this file and have been reconstructed.
function make_license(datapub, prefix) {
    const licenses = [];
    if( datapub['license_other_url'] || datapub['license_notes'] ) {
        if( datapub['license_other_url'] ) {
            licenses.push({
                '@id': datapub['license_other_url'],
                '@type': 'CreativeWork',
                'url': datapub['license_other_url'],
                'name': ( datapub['license_notes'] || datapub['license_other_url'])
            });
        } else {
            // notes-only licence: mint an internal id
            // NOTE(review): DEFAULT_IRI_PREFS has no 'licenses' key, so
            // this currently yields "undefinedother" - confirm the
            // intended prefix
            licenses.push({
                '@id': DEFAULT_IRI_PREFS['licenses'] + 'other',
                '@type': 'CreativeWork',
                'name': datapub['license_notes']
            });
        }
    }
    if( datapub['license_identifier'] ) {
        licenses.push({
            '@type': 'CreativeWork',
            'name': datapub['license_identifier'],
            'url': datapub['license_identifier']
        });
    }
    return licenses;
}
// Public API: the single crosswalk entry point.
module.exports = { datapub2catalog };