Skip to content
Snippets Groups Projects
Commit 5604b931 authored by Mike Lynch's avatar Mike Lynch
Browse files

Version which chokes on one of the ro-crate samples when trying to convert it

to a solr record
parent 70998061
Branches
No related merge requests found
......@@ -4,21 +4,30 @@ const yargs = require('yargs');
const CatalogSolr = require('./lib/CatalogSolr');
const fs = require('fs-extra');
const path = require('path');
const OCFLRepository = require('ocfl').Repository;
const argv = yargs['argv'];

// Path to the JSON config file (--config flag, defaults to ./config.json).
const configPath = argv.config || './config.json';
if (!fs.existsSync(configPath)) {
    console.error(`Please provide a valid config file path: ${configPath}`);
    process.exit(1);
}
const configJson = require(configPath);
// Normalise the source path so it always ends with a trailing slash.
const sourcePath = _.endsWith(configJson['source'], '/') ? configJson['source'] : `${configJson['source']}/`;
const solrUpdate = configJson['solrUpdate'] || '';
const fieldConfig = require(configJson['fields']);
const logLevel = configJson['logLevel'] || 4;
const waitPeriod = configJson['waitPeriod'] || 0;
const batchNum = configJson['batch'] || 1000;
const catalogFilename = configJson['catalogFilename'] || 'CATALOG.json';
// When truthy, treat sourcePath as an OCFL repository instead of a flat directory.
const ocflMode = configJson['ocfl'] || false;
const sleep = ms => new Promise((r, j) => {
console.log('Waiting for ' + ms + ' seconds');
......@@ -37,6 +46,7 @@ function commitDocs(solrURL, URI) {
}
function updateDocs(solrURL, coreObjects) {
console.log('updateDocs: ' + solrURL);
return axios({
url: solrURL + '/docs',
method: 'post',
......@@ -48,6 +58,8 @@ function updateDocs(solrURL, coreObjects) {
});
}
// needs to be replaced because OCFL repositories don't use this flat directory layout
function jsonRecords(basePath, dirs) {
const records = [];
_.each(dirs, (d) => {
......@@ -62,6 +74,7 @@ function jsonRecords(basePath, dirs) {
return records;
}
function solrObjects(recs) {
let catalog = new CatalogSolr();
catalog.setConfig(fieldConfig);
......@@ -85,40 +98,103 @@ function solrObjects(recs) {
return catalogs;
}
// MAIN APP
let dirs = null;
if (fs.existsSync(sourcePath)) {
dirs = fs.readdirSync(sourcePath).filter(f => fs.statSync(path.join(sourcePath, f)).isDirectory());
} else {
console.error(`Source path doesn't exist: ${sourcePath}`);
process.exit(1);
// Scan a flat directory layout: return the names of all immediate
// subdirectories of root. Exits the process if root doesn't exist.
async function loadFromDirs(root) {
    try {
        // fs.stat rejects when the path is missing, so existence must be
        // tested with try/catch — a falsy-return check never fires.
        await fs.stat(root);
    } catch (e) {
        console.error(`Source path doesn't exist: ${root}`);
        process.exit(1);
    }
    const paths = await fs.readdir(root);
    const dirs = [];
    for (const p of paths) {
        // stat relative to root (not the process cwd) and use the root
        // parameter rather than the sourcePath global.
        const s = await fs.stat(path.join(root, p));
        if (s.isDirectory()) {
            dirs.push(p);
        }
    }
    return dirs;
}
const batch = _.chunk(dirs, batchNum);
dirs = null;
batch.reduce((promise, p, index) => {
return promise.then(() => {
if (logLevel >= 4) console.log(`Using: ${Math.round(process.memoryUsage().rss / 1024 / 1024 * 100) / 100} MBs`);
const records = jsonRecords(sourcePath, p);
const catalogs = solrObjects(records);
return updateDocs(solrUpdate, catalogs).then(async () => {
if (waitPeriod) {
const waited = await sleep(waitPeriod);
}
console.log(`batch ${index} of ${batch.length} : Update docs`);
if (index >= batch.length - 1) {
console.log('run commit');
return commitDocs(solrUpdate, '?commit=true&overwrite=true').then(() => {
return Promise.resolve();
});
// Walk every object in the OCFL repository at repoPath and collect the
// parsed JSON of each catalog file (catalogFilename) found in the head
// version's state. Returns an array of parsed JSON records.
async function loadFromOcfl(repoPath) {
    const repo = new OCFLRepository();
    await repo.load(repoPath);
    const objects = await repo.objects();
    const records = [];
    for (const oid of Object.keys(objects)) {
        const object = objects[oid];
        const inv = await object.getInventory();
        const headState = inv.versions[inv.head].state;
        for (const hash of Object.keys(headState)) {
            // Each content hash maps to one or more logical paths in the
            // head state; match the catalog by filename.
            if (headState[hash].includes(catalogFilename)) {
                // Resolve the hash to an actual content path via the manifest.
                const jsonfile = path.join(object.path, inv.manifest[hash][0]);
                const json = await fs.readJson(jsonfile);
                records.push(json);
            }
        }
    }
    return records;
}
// Push records to Solr in chunks of batchNum, sleeping between batches when
// waitPeriod is set, and issuing a commit after the final batch.
// NOTE: the original built a reduce() promise chain but never returned or
// awaited it, so this async function resolved before any update ran; the
// awaited loop below fixes that while keeping the same per-batch behaviour.
async function commitBatches (records) {
    console.log("updating " + records.length + " records");
    const batches = _.chunk(records, batchNum);
    for (let index = 0; index < batches.length; index++) {
        if (logLevel >= 4) console.log(`Using: ${Math.round(process.memoryUsage().rss / 1024 / 1024 * 100) / 100} MBs`);
        try {
            const catalogs = solrObjects(batches[index]);
            console.log(catalogs);
            await updateDocs(solrUpdate, catalogs);
            if (waitPeriod) {
                await sleep(waitPeriod);
            }
            console.log(`batch ${index} of ${batches.length} : Update docs`);
            if (index >= batches.length - 1) {
                // Final batch: tell Solr to commit so the documents become visible.
                await commitDocs(solrUpdate, '?commit=true&overwrite=true');
            }
        } catch (e) {
            // Best-effort, as in the original: log the failure and carry on
            // with the next batch rather than aborting the whole run.
            console.log(e);
        }
    }
}
// Entry point: load records from either an OCFL repository or a flat
// directory (depending on config), then index them into Solr in batches.
async function main () {
    const records = ocflMode
        ? await loadFromOcfl(sourcePath)
        : await loadFromDirs(sourcePath);
    console.log("Got " + records.length + " records from " + sourcePath);
    await commitBatches(records);
}

// Don't leave main() as a floating promise: surface any rejection and
// exit non-zero instead of dying with an unhandled-rejection warning.
main().catch((e) => {
    console.error(e);
    process.exit(1);
});
{
"source": "/Users/moises/source/peppermint/publication/public/",
"ocfl": "/Users/moises/source/peppermint/publication/ocfl/",
"solrUpdate": "http://localhost:8983/solr/dataset/update/json",
"source": "/Users/mike/working/redbox/ocfl-nginx/test_repos/rocrate_ocfl",
"ocfl": 1,
"dry-run": 1,
"catalogFilename": "ro-crate-metadata.jsonld",
"solrUpdate": "http://localhost:8983/solr/ocflcore/update/json",
"logLevel": 4,
"waitPeriod": 0,
"fields": "./fields.json",
......
......@@ -2,23 +2,31 @@ const yargs = require('yargs');
const randomize = require('datacrate/lib/randomize');
const assert = require('assert');
const fs = require('fs-extra');
const ocfl = require('ocfl');
const argv = yargs['argv'];
// --n: how many datacrates to generate (required).
assert.notStrictEqual(argv.n, undefined, 'Please include --n as number of datacrates to generate');
const numberOfDatacrates = argv.n;
// --d: output directory where the crates are written (required).
assert.notStrictEqual(argv.d, undefined, 'Please include --d as directory where datacrates are to be generated');
const datacrateDirPath = argv.d;
// --o: when truthy, stage crates for check-in to an OCFL repository.
// NOTE(review): this re-declares `ocfl`, which is also bound by
// `require('ocfl')` above — confirm which binding is intended.
const ocfl = argv.o;
// Staging directory used in ocfl mode before crates are checked in.
const TEMPDIR = './tmp/';
// if putting them in an ocfl repository, write them to a temporary dest/ and then
// check them in (so I don't have to update the datacrate library too much)
async function createDatacrates(dest, n) {
const sourcedata = await randomize.loadsourcedata('./node_modules/datacrate/vocabularies');
const datapubs = randomize.randdatapubs(n, sourcedata);
datapubs.reduce((promise, p, index) => {
return promise.then(async () => {
const id = await randomize.makedir(dest);
return randomize.makedatacrate(dest, p, id)
.then(() => {
return createDatacrate(dest, p).then(() => {
if (index >= n) {
console.log("Done");
}
......@@ -28,7 +36,21 @@ async function createDatacrates(dest, n) {
}, Promise.resolve());
}
// Generate a single datacrate for publication `pub`. In ocfl mode the crate
// is staged under TEMPDIR (to be checked into the repository later);
// otherwise it is written directly under dest. The `id` parameter is kept
// for caller compatibility but a fresh directory id is always generated.
async function createDatacrate(dest, pub, id) {
    if (ocfl) {
        // Was `TMP`, which is undefined — the staging constant is TEMPDIR.
        const crateId = await randomize.makedir(TEMPDIR);
        return randomize.makedatacrate(TEMPDIR, pub, crateId);
    } else {
        // Was `p`, which is undefined here — the parameter is `pub`.
        const crateId = await randomize.makedir(dest);
        return randomize.makedatacrate(dest, pub, crateId);
    }
}
try {
if( ocfl ) {
fs.ensureDirSync(TEMP);
}
fs.ensureDirSync(datacrateDirPath);
createDatacrates(datacrateDirPath, numberOfDatacrates)
.then(() => {
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment