Commit 1f5b7497 authored by Mike Lynch

Rewriting the indexing config so that it's TYPE/FIELD/{config}

Faceting and resolving items are now unified under per-field config.
Rejigged filtering and the filter tests are passing. Added a test case
for empty filters, which let everything through.
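
The new shape, for example (a sketch; field names taken from the
fields.json changes below):

    {
      "types": {
        "Dataset": {
          "path":    { "filter": { "re": "^\\./|data/$" } },
          "creator": { "resolve": "multi", "facet": true },
          "keywords": { "facet": { "tokenize": "," } }
        }
      }
    }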
parent 4575dca1
@@ -19,19 +19,16 @@ class CatalogSolr {
setConfig(config) {
this.config = config;
// compile filters
// if( !this.validateConfig() ) {
// return null;
// }
this.filters = {};
_.each(this.config['types'], ( cf, type ) => {
if( 'filter' in cf ) {
this.filters[type] = this.compileFilter(cf['filter'])
} else {
this.filters[type] = () => true;
}
_.each(this.config['types'], ( typecf, type ) => {
const typef = {};
_.each(typecf, ( fieldcf, field ) => {
if( 'filter' in fieldcf ) {
typef[field] = fieldcf['filter'];
}
});
this.filters[type] = this.compileFilter(typef);
});
}
@@ -256,30 +253,50 @@ class CatalogSolr {
const cfTypes = this.config['types'];
const solrDocument = {};
// loop through each item in the JSON-LD @graph
for ( const item of crate.graph ) {
const type = item['@type'];
if( type in cfTypes ) {
// get config for this type of item
const cf = cfTypes[item['@type']];
// filters are compiled from config when it's loaded
if( this.filters[type](item) ) {
// test if item passes the filters (these were compiled when the
// config was loaded)
if( this.filters[type](item) ) {
// start with the fields which have configured mappings for all items
const solr = this.baseSolr(cfMap, item);
const skip = cf['skip'] || [];
// then map the rest, unless they're in 'skip'
_.each(item, ( value, field ) => {
if( !(field in cfMap) ) {
if( cf.flatten && cf.flatten[field] ) {
solr[field] = this.flatten(cf.flatten[field], value);
if( !(field in cfMap) && !skip.includes(field) ) {
// 'resolve' contains fields where we need to look up
// item(s) in the graph to get information we want in the
// index (like a person or an organisation's name)
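// e.g. a creator value [ { "@id": "#person1" } ] is looked up in the
// crate and the resolved Person item is flattened into the index
// (ids here are hypothetical)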
if( cf.resolve && cf.resolve[field] ) {
solr[field] = this.resolveValues(crate, cf.resolve[field], value);
} else {
solr[field] = value;
solr[field] = this.unwrap(value);
}
}
});
// TODO - add facets
// A separate loop to look up and add facets, because we may
// need to do different stuff with them even if we've already looked
// them up in the main loop (id lookups in ROCrate are cheap)
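// e.g. facet 'keywords' with field_suffix '_Dataset_facetmulti' is
// written to solr['keywords_Dataset_facetmulti']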
_.each(cf['facets'], ( facetcf, facet ) => {
const facetField = facet + facetcf['field_suffix'];
console.log(`resolving facets: ${facet}`);
solr[facetField] = this.resolveFacets(crate, facetcf, item[facet]);
});
if( !(type in solrDocument) ) {
solrDocument[type] = [];
}
console.log(JSON.stringify(solrDocument))
solrDocument[type].push(solr)
}
}
@@ -290,6 +307,64 @@ class CatalogSolr {
}
resolveValues(crate, cf, value) {
if( cf === 'multi' ) {
return value.map((v) => this.resolveAndFlatten(crate, v));
} else {
return this.resolveAndFlatten(crate, value);
}
}
resolveAndFlatten(crate, value) {
if( !('@id' in value ) ) {
return "no @id in value";
}
const resolved = crate.getItem(value['@id']);
if( !resolved ) {
return `@id ${value['@id']} not found`;
}
return JSON.stringify(resolved).replace(/"/g, '\'');
}
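// e.g. a resolved item {"@id":"#p1","name":"Jo"} comes back as the
// literal "{'@id':'#p1','name':'Jo'}", so Solr stores a string, not
// JSON (values here are hypothetical)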
// this could be merged with resolveValues, but perhaps
// is better kept separate. I think that the facets need
// to be made available in flattened form, but I don't
// understand the logic of the old code, so for now it's
// this
resolveFacets(crate, cf, value) {
if( cf['tokenize'] ) {
if( value ) {
return value.split(cf['tokenize']['delim']);
} else {
return [];
}
}
if (cf['fieldName']) {
if( Array.isArray(value) ) {
return value.map((v) => {
const resolved = crate.getItem(v['@id']);
if( resolved ) {
return resolved[cf['fieldName']]
} else {
return null;
}
});
} else {
return [];
}
}
return value;
}
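// e.g. with tokenize.delim ',' the value "a,b" becomes [ "a", "b" ];
// with fieldName 'name', each linked item resolves to its name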
// mappings which are done for all solr records
@@ -298,50 +373,24 @@ class CatalogSolr {
const base = {};
_.each(map_all, ( targets, field ) => {
_.each(targets, ( target ) => {
base[target] = item[field]
base[target] = this.unwrap(item[field])
});
});
return base;
}
// flatten: take a value which is a JSON object and escape it and stringify it
// so it can get saved in Solr as a literal, not JSON.
// this is used when we want the JSON structure to get pushed through to the
// UI at the Portal for things like Creator.
// unwrap a value if it's in an array
// if the value in config is 'array', then the input is interpreted as an
// array of values, each of which is flattened individually and returned as
// an array - this is what happens for eg with creators
//
// [ { "@id": "creator1" }, { "@id": "creator2" } ]
//
// becomes
//
// [ "{ \'@id\': \'creator1\'}", "{ \'@id\': \'creator2\'}" ]
//
// if not, the value is flattened as a whole, ie:
//
// "[ { \'@id\': \'creator1\'}, { \'@id\': \'creator2\'} ]
flatten_value(cf, value) {
if( cf === 'array' ) {
return value.map(flatten);
unwrap(value) {
if( Array.isArray(value) ) {
return value[0];
} else {
return flatten(value);
return value;
}
}
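// e.g. unwrap([ "./", "data/" ]) returns "./" (just the first value);
// unwrap("./") returns "./" unchanged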
flatten(value) {
return JSON.stringify(value).replace(/"/g, '\'')
}
createSolrObject(jsonld, rootConfig) {
//Peter's idea is to convert everything into an array then it is safer to work to convert
......
@@ -18,14 +18,14 @@
"axios": "^0.18.1",
"chai": "^4.2.0",
"datacrate": "git+https://code.research.uts.edu.au/eresearch/datacrate.git#random_as_library",
"fs-extra": "^8.0.1",
"fs-extra": "^8.1.0",
"lodash": "^4.17.15",
"ocfl": "^1.0.3",
"random-word": "^2.0.0",
"ro-crate": "^1.0.1",
"yargs": "^13.2.4"
"ro-crate": "^1.1.0",
"yargs": "^13.3.0"
},
"devDependencies": {
"mocha": "^6.1.4"
"mocha": "^6.2.0"
}
}
@@ -5,63 +5,40 @@
},
"types": {
"Dataset": {
"filter": { "path": { "re": "^\\./|data/$"} },
"recordTypeName": "dataset",
"flatten": {
"creator": "array",
"GeoCoordinates": {}
"path": {
"filter": { "re": "^\\./|data/$" }
},
"hasPart": "skip",
"creator": {
"resolve": "multi",
"facet": true
},
"facets": {
"datePublished": {
"field_suffix": "_Dataset_facet",
"trim": true
},
"keywords": {
"trim": true,
"field_suffix": "_Dataset_facetmulti",
"tokenize": {
"delim": ","
}
},
"creator": {
"trim": true,
"fieldName": "name",
"field_suffix": "_Dataset_facetmulti"
},
"publisher": {
"trim": true,
"fieldName": "@id",
"field_suffix": "_Dataset_facet"
},
"type": {
"trim": true,
"field_suffix": "facetmulti",
"skip_entry_type_suffix": true,
"escape_value": "solr_field"
"keywords": {
"facet": {
"tokenize": ","
}
},
"oai-pmh": {
"core": "dataset",
"set": "datasource:ReDBox"
"datePublished": {
"facet": true
},
"publisher": {
"resolve": "single",
"facet": {
"field": "@id"
}
}
},
"Person": {
"reverse": "in the reverse jsonld pointer",
"core": "dataset",
"format": "jsonld",
"recordTypeName": "person",
"flatten": {
"affiliation": "array"
"affiliation": {
"resolve": "multi"
},
"facets": {
"type": {
"trim": true,
"field_suffix": "facetmulti",
"skip_entry_type_suffix": true,
"escape_value": "solr_field"
}
}
}
}
}
\ No newline at end of file
@@ -24,14 +24,12 @@ function randomGraph(n, type, fields, value_callback) {
});
}
function makeCatalog(spec) {
function makeCatalog(fieldcf) {
const catalog = new CatalogSolr();
catalog.setConfig({
types: {
Dataset: {
filter: spec
}
Dataset: fieldcf
}
});
@@ -50,12 +48,23 @@ function randomSubstring(word) {
describe('single filters', function () {
this.timeout(TIMEOUT);
it('matches everything when filter is empty', function () {
const graph = randomGraph(GRAPH, 'Dataset', ['path']);
const catalog = makeCatalog({path: {} });
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.have.lengthOf(GRAPH);
});
it('can pick items by exact matching a single field', function () {
_.times(REPEATS, () => {
const graph = randomGraph(GRAPH, 'Dataset', ['path']);
const item = _.sample(graph);
const lookfor = item['path'][0];
const catalog = makeCatalog({path: lookfor});
const catalog = makeCatalog({path: {filter: lookfor} });
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.not.be.empty;
_.each(matches, (match) => {
@@ -70,7 +79,7 @@ describe('single filters', function () {
const graph = randomGraph(GRAPH, 'Dataset', ['path']);
const item = _.sample(graph);
const lookfor = randomSubstring(item['path'][0]);
const catalog = makeCatalog({ path: { re: lookfor } } );
const catalog = makeCatalog({ path: { filter: { re: lookfor } } });
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.not.be.empty;
const lookfor_re = new RegExp(lookfor);
@@ -85,7 +94,7 @@ describe('single filters', function () {
_.times(REPEATS, () => {
const graph = randomGraph(GRAPH, 'Dataset', ['path'], () => [ _.sample(['./', 'data/'])] );
const lookfor = "^\\./|data/$";
const catalog = makeCatalog({path: { re: lookfor } } );
const catalog = makeCatalog({path: { filter: { re: lookfor } } } );
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.have.lengthOf(GRAPH);
const lookfor_re = new RegExp(lookfor);
@@ -103,7 +112,7 @@ describe('single filters', function () {
const lookfor = item['path'][0];
// add more items without a path
const graph2 = graph.concat(randomGraph(GRAPH, 'Dataset', ['name']));
const catalog = makeCatalog({path: { re: lookfor } } );
const catalog = makeCatalog({path: {filter: { re: lookfor } }});
const matches = graph2.filter(catalog.filters['Dataset']);
const lookfor_re = new RegExp(lookfor);
_.each(matches, (match) => {
@@ -117,7 +126,7 @@ describe('single filters', function () {
it('can filter values whether or not they are in arrays', function () {
const values = [ [ 'one' ], [ 'two' ], 'one', 'two' ];
const graph = values.map((v) => { return { '@type': 'Dataset', 'path': v }});
const catalog = makeCatalog({path: 'one' });
const catalog = makeCatalog({path: {filter: 'one' }});
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.have.lengthOf(2);
_.each(matches, (match) => {
@@ -143,15 +152,15 @@ describe('single filters', function () {
function randomFilter(fields, item) {
const n = _.random(2, fields.length);
const ffields =_.sampleSize(fields, n);
const filter = {};
const filters = {};
_.each(ffields, (ff) => {
if( _.random(1) === 0 ) {
filter[ff] = item[ff][0]
filters[ff] = { filter: item[ff][0] }
} else {
filter[ff] = { re: randomSubstring(item[ff][0]) }
filters[ff] = { filter: { re: randomSubstring(item[ff][0]) } }
}
});
return filter;
return filters;
}
describe('multiple filters', function () {
......
const assert = require('assert');
const expect = require('chai').expect;
const _ = require('lodash');
const path = require('path');
const fs = require('fs-extra');
@@ -18,6 +19,8 @@ function jsonRecord(basePath, fileName) {
}
}
// TODO: have this actually test a dataset and some people
describe('create solr object', function () {
const test_data = path.join(process.cwd(), 'test-data');
const fieldsPath = path.join(test_data, 'fields.json');
@@ -33,6 +36,6 @@ describe('create solr object', function () {
fs.writeFileSync(path.join(test_data, "solr_output.json"), JSON.stringify(solrObject, null, 2));
assert.strictEqual(solrObject['Dataset'][0]['record_format_s'], 'Dataset','Dataset not loaded');
expect(solrObject['Dataset'][0]['record_type_s']).to.equal('Dataset');
});
});
\ No newline at end of file