Transformation which splits a document and inserts it as multiple documents - javascript

The only way I found to do it is through MLCP, where your transformation has to return an array of the format [{"uri": "some_uri1", "value": "newly_created_doc1"},{"uri": "some_uri2", "value": "newly_created_doc2"}]. This way, after applying the transformation with Content Pump, those two documents are created from, say, one original document. My question is how to achieve the same result via the Node.js API or the Java API. For example, with Node.js I am able to apply the transformation, but it creates the file as a single document containing an array in the style above.
My transformation:
const common = require("/ext/home/common.sjs");

function createDocs(content, context) {
  var bets = content.value.xpath('//selections').toArray();
  var documents = [];
  for (var bet in bets) {
    var bookie_name = "Boylesports";
    var sport_name = "Football";
    var event_name = bets[bet].xpath('../../../../name').toString();
    if (/^Over \d+\.\d+$/.test(event_name)) {
      // '$1' keeps the captured integer part; a bare 1 would replace the match with the literal string "1"
      event_name = event_name.replace(/^Over (\d+)\.\d+$/, '$1');
    }
    var subevent_name = bets[bet].xpath('../../../../name').toString();
    if (/^Over \d+\.\d+$/.test(subevent_name)) {
      subevent_name = subevent_name.replace(/^Over (\d+)\.\d+$/, '$1');
    }
    var subevent_id = bets[bet].xpath('../../../../idfoevent');
    var start_date = xdmp.parseDateTime("[Y0001]-[M01]-[D01]T[h01]:[m01]:[s01]", bets[bet].xpath('../../../../tsstart'));
    // var start_date = "2017-10-21T13:00:00Z";
    var market_name = bets[bet].xpath('../../name').toString();
    if (/^Handicap.+$/.test(market_name)) {
      market_name = market_name.replace(/^Handicap.+$/, "Handicaps");
    }
    if (/^Match Betting$/.test(market_name)) {
      market_name = market_name.replace(/^Match Betting$/, "Win Market");
    }
    var market_id = bets[bet].xpath('../../idfomarket');
    var bet_name = bets[bet].xpath('name').toString();
    if (/^Aston Villa$/.test(bet_name)) {
      bet_name = bet_name.replace(/^Aston (Villa)$/, '$1');
    }
    var bet_id = bets[bet].xpath('idfoselection');
    // Build the document entry for this selection
    var document = {};
    document.uri = common.convertToSlug(sport_name) + '/' + common.convertToSlug(event_name) + '/' + common.convertToSlug(subevent_name) + '/' + common.convertToSlug(market_name) + '/' + common.convertToSlug(bet_name);
    document.value = {
      'bookie_name': bookie_name,
      'sport_name': sport_name,
      'sport_slug': common.convertToSlug(sport_name),
      'event_name': event_name,
      'event_slug': common.convertToSlug(event_name),
      'subevent_name': subevent_name,
      'subevent_slug': common.convertToSlug(subevent_name),
      'subevent_id': subevent_id,
      'start_date': start_date,
      'market_name': market_name,
      'market_slug': common.convertToSlug(market_name),
      'market_id': market_id,
      'bet_name': bet_name,
      'bet_slug': common.convertToSlug(bet_name),
      'bet_id': bet_id
    };
    // Only push documents whose URI has not been seen yet
    if (documents.findIndex(x => x.uri == document.uri) === -1) documents.push(document);
  }
  return Sequence.from(documents);
}
exports.transform = createDocs;
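(For reference, the transform has to be installed on the server under the name used in the write call below; with the marklogic Node.js client that is typically a one-time step along these lines. This is a sketch from memory, so check the docs for your client version:)

const fs = require('fs');
const marklogic = require('marklogic');
const db = marklogic.createDatabaseClient({ /* host, port, user, password */ });

// Install the server-side JavaScript module above as the REST transform 'js-example'
db.config.transforms.write('js-example', 'javascript', fs.createReadStream('./createDocs.sjs'))
  .result(function () {
    console.log('transform installed');
  });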
My usage in Node.js:
const axios = require("axios");
const db_client = require("../../database/connect").dbWriter;

axios.get("http://cache.boylesports.com/feeds/EUROFOOT2.json")
  .then(function (response) {
    console.log(response.data);
    var documents = [{
      uri: "http://cache.boylesports.com/feeds/EUROFOOT2.json",
      content: response.data,
      contentType: 'application/json',
      collections: ["test"]
    }];
    db_client.documents.write({documents: documents, transform: ['js-example']}).result(
      function (response) {
        response.documents.forEach(function (document) {
          console.log('LOADED: ' + document.uri);
        });
      },
      function (error) {
        console.log(error);
      }
    );
  })
  .catch(function (error) {
    console.log(error);
  });

The Client API relies on REST transforms, which do not support that kind of splitting into multiple documents. I was hoping to say that you could use DMSDK, but it looks like that relies on the same kind of transforms. Here is an example that transforms documents after uploading them into the database:
http://docs.marklogic.com/guide/java/data-movement#id_22702
It looks like you would have to go against the recommendation and create a transform that causes side effects by calling xdmp:document-insert directly.
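For illustration, a minimal sketch of such a side-effecting transform (untested; buildDocuments is a hypothetical helper holding the same per-selection logic as createDocs above, and declareUpdate() lets the module perform inserts):

// createDocsSideEffect.sjs - inserts the split documents itself, returns the original
declareUpdate();

function splitAndInsert(context, params, content) {
  var documents = buildDocuments(content); // hypothetical: same logic as createDocs
  documents.forEach(function (doc) {
    // Side effect: write each derived document directly instead of returning it
    xdmp.documentInsert(doc.uri, doc.value);
  });
  return content; // pass the original document through unchanged
}
exports.transform = splitAndInsert;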
HTH!

Related

Fetch operation is a lot slower when called from JavaScript code than when called from react-native

So I have developed an app that crawls data from some HTML sites.
I wanted to keep the parser code separate from the app, so that when I want to make changes to a parser I won't have to republish the app. I could host the parsers' JS files on GitHub, for example, and let the app download and evaluate them.
I got that working, but then I hit a weird problem: calling fetch from the React Native (TypeScript) code is a lot faster than calling the same fetch operations from the downloaded JavaScript.
Why is that? I am using the same HttpClient object; the only difference is that I am calling it from a different place.
Here is one of the parsers, and here is the parser translator:
private async loadParser() {
  try {
    this.script = await HttpClient.GetText(this.onlineParserURI);
    if (this.script && this.script.length >= 5) {
      this.searchFunction = new Function(...this.parameter, "filter", "page", this.script + "\n return search(filter, page);");
      this.getNovelFunction = new Function(...this.parameter, "novelUrl", this.script + "\n return getNovel(novelUrl);");
      this.latestFunction = new Function(...this.parameter, "page", this.script + "\n return latest(page);");
      this.getChapterFunction = new Function(...this.parameter, "url", this.script + "\n return getChapter(url);");
      var detaliFunc = new Function(...this.parameter, this.script + "\n return parserDetali();");
      var item = detaliFunc(...this.types); // the constructor, e.g. name, url, etc.
      this.url = item.url;
      this.latestUrl = item.latestUrl;
      this.searchUrl = item.searchUrl;
      this.id = item.id;
      this.parserLanguage = item.parserLanguage;
      this.name = item.name;
      this.panination = item.panination;
      this.searchPagination = item.searchPagination;
      this.icon = item.icon;
      this.parserSearchSettings = item.parserSearchSettings;
      this.detaliItemType = item.detaliItemType;
      this.onlineDefaultFilter = item.defaultFiter;
      this.onlineSections = item.sections;
      Object.assign(this, item); // add the extra fields
      this.state = "ready";
    } else {
      this.state = "failed";
    }
  } catch (error) {
    console.log(error);
    this.state = "failed";
  }
  return this;
}
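(For clarity, a standalone sketch of the new Function technique used above, with hypothetical names: the downloaded script defines named functions, and a return statement appended to the source exposes one of them with injected dependencies:)

// Pretend this string was downloaded from GitHub
const script = "function latest(page) { return 'results for page ' + page; }";

// Parameter names come first, the function body (script + return) comes last
const latestFunction = new Function("HttpClient", "page", script + "\n return latest(page);");

console.log(latestFunction(null, 3)); // "results for page 3"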
And here is how I call the methods in the parser:
async latest(page: number): Promise<LightItem[]> {
  try {
    if (this.latestFunction)
      return await this.latestFunction(...this.types, page);
  } catch (error) {
    console.log(error);
  }
  return [];
}
Here are the default types and parameters that I pass in:
this.types = [
  this,
  Chapter,
  LightItem,
  Filter,
  DetaliItem,
  ParserSearchSettings,
  labelValue,
  DetaliItemType,
  Section,
  NovelReviews,
  HttpClient,
];
this.parameter: Array<string> = [
  "parser",
  "Chapter",
  "LightItem",
  "Filter",
  "DetaliItem",
  "ParserSearchSettings",
  "labelValue",
  "DetaliItemType",
  "Section",
  "NovelReviews",
  "HttpClient"
];
This is the method that takes too long. There are 10 calls; the first 3 are fast, then it gets slow for some reason.
Note: the call to await HttpClient.getHtml(url) is what takes long; it sometimes gets stuck.
async function getChapters(novelUrl) {
  var page = 1;
  var result = [];
  while (page > 0) {
    var url = novelUrl + "/page-" + page;
    // was named "container", which left "items" below undefined
    var items = parser.jq(await HttpClient.getHtml(url)).find(".chapter-list a");
    if (!items.hasElements()) {
      page = 0;
      break;
    }
    var resultA = items.map((x) => {
      var ch = new Chapter(x.attr("title").text(), x.attr("href").url());
      if (result.find((a) => a.chapterUrl == ch.chapterUrl && a.name == ch.name) === undefined)
        return ch;
      return undefined;
    }, []);
    if (resultA.length <= 0) {
      page = 0;
      break;
    }
    resultA.forEach((x) => {
      result.push(x);
    });
    page++;
  }
  return result;
}
Any idea why this is happening?

Is it possible to use break to jump out of, or skip, the crawling function?

Hi, I would like to know if it is possible to use "break" or any other method to skip the crawling function after the script has run for the first time, and just use the array variable that contains the crawled information for the next user request.
let glossary = [];

/** Initialise crawling and save the results into the glossary array */
function init() {
  const glossary_url = 'https://xxxx';
  const headers = {
    cookie: 'cookies:kdkjslkd424'
  };
  const options = {
    url: glossary_url,
    method: 'GET',
    headers: headers
  };
  request(options, function (error, response, body) {
    const newDom = new jsdom(body);
    const $ = require('jquery')(newDom.window);
    $('ul > li > span[style="line-height:1.6;"]').each(function (index, element) {
      let text = element.textContent.trim(); // Get text from html
      let splitText = text.split(' = '); // split text by =
      //console.log(text);
      if (splitText.length > 1) {
        glossary.push({
          key: splitText[0].trim(),
          value: splitText[1],
          value2: splitText[2]
        });
      }
    });
    //console.log(glossary);
    findMatch('DPDL');
  });
}
break init; // <-- this is what I tried; break is only valid inside loops/switch, so this is a syntax error
function findMatch(key) {
  for (i = 0; i < glossary.length; i++) {
    if (glossary[i].key === key) {
      // console.log(glossary[i].value);
      // console.log(glossary[i].key);
      // console.log(glossary[i].value2);
      // console.log(key);
      console.log(key + ' = ' + glossary[i].value + ' ' + glossary[i].value2);
    }
  }
}
init();
In short: I want to break out of or skip the crawl function, so that if the user searches another value it is simply looked up in the glossary array (glossary = []) instead of crawling again, since crawling takes a long time.
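A minimal sketch of that intent (hypothetical, not from the original post): guard the crawl behind a check on the cached array, so repeat searches skip it entirely. It assumes init is changed to accept the key instead of hardcoding 'DPDL':

function findOrCrawl(key) {
  if (glossary.length > 0) {
    // Already crawled once: answer straight from the cached array
    findMatch(key);
  } else {
    // First request: crawl; init's request callback ends by calling findMatch(key)
    init(key);
  }
}

findOrCrawl('DPDL'); // crawls, then matches
findOrCrawl('ABC');  // served from the glossary array, no second crawl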

indexOf() doesn't work

I have an array of strings and I'd like to check if any of these strings contains a certain substring.
The following code is an example of what I need to achieve: the function gets a message from a user and searches through a series of URLs for the one that contains the user's message in the title:
var redditSubModule = "node";
var http = require('https');
var url = "https://www.reddit.com/r/soccerstreams/new/.json";
var i = 0;
var subject = (ctx.message.text).substring(8);
var request = http.get(url, function(response) {
  var json = '';
  response.on('data', function(chunk) {
    json += chunk;
  });
  response.on('end', function() {
    var redditResponse = JSON.parse(json);
    redditResponse.data.children.forEach(function(child) {
      if (child.data.domain !== 'self.node') {
        url = child.data.url;
        console.log('URL : ' + url);
        if (url.indexOf(subject) > -1) {
          console.log('URL : ' + url);
        } else {
          console.log("nothing...");
        }
        i++;
      }
    });
  });
});
request.on('error', function(err) {
  console.log(err);
});
If you fill subject with "USA" (because there's a thread with that string in the title), indexOf doesn't seem to work and it prints a list of "nothing..."
Logging typeof url gives string, so I don't know what is going on here...
What am I doing wrong?
You can use Array.prototype.find together with String.prototype.includes:
var strs = ["aaa", "xxx", "yyy_leon"];
var search = "leon";
var found = strs.find(s => s.includes(search));
if (found) {
  console.log("found: ", found);
} else {
  console.log("nothing");
}
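Note that both indexOf and includes are case-sensitive, which is a common reason such a search "fails". A sketch with made-up sample URLs that normalizes case on both sides:

var subject = "USA";
var urls = [
  "https://example.com/england-vs-france",
  "https://example.com/usa-vs-mexico"
];
// Lower-case both sides so "USA" still matches "usa-vs-mexico"
var match = urls.find(u => u.toLowerCase().includes(subject.toLowerCase()));
console.log(match ? "URL : " + match : "nothing...");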

NodeJS Loop issue due to async/synchronicity issues

I am porting an old Ruby script over to JavaScript, setting the function up as a cron instance so it will run on schedule. The function queries our MySQL database, retrieves inventory information for our products, and then sends requests to a trading partner's API to update our inventory on their site.
Due to Node's asynchronicity I am running into issues. We need to chunk requests into 1000 items per request, and we are sending 10k products. The issue is that each request just sends the last 1000 items every time: the for loop inside the while loop moves on before the JSON request body has finished being built. I tried creating anonymous setTimeout functions in the while loop to handle it, and also tried creating an object holding the request function and the variables to pass, stuffing it into an array to iterate over once the while loop completes, but I get the same result. I'm not sure of the best way to handle it so that each request gets the correct batch of items. I also need to wait 3 minutes between each request of 1000 items to avoid hitting the request cap.
query.on('end', () => {
  connection.release();
  writeArray = itemArray.slice(0),
  alteredArray = [];
  var csv = json2csv({data: writeArray, fields: fields}),
    timestamp = new Date(Date.now());
  timestamp = timestamp.getFullYear() + '-' + (timestamp.getMonth() + 1) + '-' + timestamp.getDate() + ' ' + timestamp.getHours() + ':' + timestamp.getMinutes() + ':' + timestamp.getSeconds();
  let fpath = './public/assets/archives/opalEdiInventory-' + timestamp + '.csv';
  while (itemArray.length > 0) {
    alteredArray = itemArray.splice(0, 999);
    for (let i = 0; i < alteredArray.length; i++) {
      jsonObjectArray.push({
        sku: alteredArray[i]['sku'],
        quantity: alteredArray[i]["quantity"],
        overstockquantity: alteredArray[i]["osInv"],
        warehouse: warehouse,
        isdiscontinued: alteredArray[i]["disc"],
        backorderdate: alteredArray[i]["etd"],
        backorderavailability: alteredArray[i]["boq"]
      });
    }
    var jsonObject = {
      login: user,
      password: password,
      items: jsonObjectArray
    };
    postOptions.url = endpoint;
    postOptions.body = JSON.stringify(jsonObject);
    funcArray.push({
      func: function(postOptions) {
        request(postOptions, (err, res, body) => {
          if (err) { console.error(err); throw err; }
          console.log(body);
        });
      },
      vars: postOptions
    });
    jsonObjectArray.length = 0;
  }
  var mili = 180000;
  for (let i = 0; i < funcArray.length; i++) {
    setTimeout(() => {
      var d = JSON.parse(funcArray[i]['vars'].body);
      console.log(d);
      console.log('request ' + i);
      //funcArray[i]['func'](funcArray[i]['vars']);
    }, mili * i);
  }
});
});
You need async/await or Promises to handle async actions in Node.js.
I am not sure whether your Node version supports async/await, so I have tried a Promise-based solution.
query.on('end', () => {
  connection.release();
  writeArray = itemArray.slice(0),
  alteredArray = [];
  var csv = json2csv({ data: writeArray, fields: fields }),
    timestamp = new Date(Date.now());
  timestamp = timestamp.getFullYear() + '-' + (timestamp.getMonth() + 1) + '-' + timestamp.getDate() + ' ' + timestamp.getHours() + ':' + timestamp.getMinutes() + ':' + timestamp.getSeconds();
  let fpath = './public/assets/archives/opalEdiInventory-' + timestamp + '.csv';
  var calls = chunk(itemArray, 1000)
    .map(function(chunk) {
      var renameditemsArray = chunk.map((item) => new renamedItem(item, warehouse));
      var postOptions = {};
      postOptions.url = endpoint;
      postOptions.body = JSON.stringify({
        login: user,
        password: password,
        items: renameditemsArray
      });
      return postOptions;
    });
  sequenceBatch(calls, makeRequest)
    .then(function() {
      console.log('done');
    })
    .catch(function(err) {
      console.log('failed', err);
    });

  function sequenceBatch(calls, cb) {
    var sequence = Promise.resolve();
    var count = 1;
    calls.forEach(function (callOptions) {
      count++;
      sequence = sequence.then(() => {
        return new Promise(function (resolve, reject) {
          setTimeout(function () {
            try {
              cb(callOptions);
              resolve(`callsequence${count} done`);
            } catch (err) {
              reject(`callsequence ${count} failed`);
            }
          }, 180000);
        });
      });
    });
    return sequence;
  }

  function makeRequest(postOptions) {
    request(postOptions, (err, res, body) => {
      if (err) {
        console.error(err);
        throw err;
      }
      console.log(body);
    });
  }

  function chunk(arr, len) {
    var chunks = [],
      i = 0,
      n = arr.length;
    while (i < n) {
      chunks.push(arr.slice(i, i += len));
    }
    return chunks;
  }

  function renamedItem(item, warehouse) {
    this.sku = item['sku'];
    this.quantity = item["quantity"];
    this.overstockquantity = item["osInv"];
    this.warehouse = warehouse;
    this.isdiscontinued = item["disc"];
    this.backorderdate = item["etd"];
    this.backorderavailability = item["boq"];
  }
});
Could you please try this snippet and let me know if it works? I couldn't test it since I made it up on the fly. The core logic is in the sequenceBatch function. The answer is based on another question which explains how timeouts and Promises work together.
Turns out this wasn't a closure or async issue at all: the request object I was building used references to objects instead of shallow copies, so all the data ended up linked to the same object reference in the final array.
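For illustration (a hypothetical reduction, not the original script), the pitfall looks like this: pushing the same mutable object on every iteration leaves every array entry pointing at its final state, while a shallow copy captures the values at push time:

var postOptions = { url: 'https://example.com', body: '' };
var withRefs = [];
var withCopies = [];

for (var i = 0; i < 3; i++) {
  postOptions.body = 'batch ' + i;
  withRefs.push(postOptions);                       // same reference pushed three times
  withCopies.push(Object.assign({}, postOptions));  // shallow copy captures current values
}

console.log(withRefs.map(o => o.body));   // [ 'batch 2', 'batch 2', 'batch 2' ]
console.log(withCopies.map(o => o.body)); // [ 'batch 0', 'batch 1', 'batch 2' ]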

Google Custom Search API for NodeJS

I'm trying to get the first 5 pages of search results with the Google Custom Search API...
So far I've tried to achieve the result using nested functions, but with no luck.
I know that I'm messing up the callbacks, but so far I haven't figured out the correct way (without using a promises library) to solve my problem.
Could some of you point me in the right direction?
Thanks.
app.get('/assesment', function(req, res) {
  console.log('route: /assesment');
  var api_key = '';
  var customsearch = google.customsearch('v1');
  var response = "";
  var number_of_pages = 5;
  var next_page = 1;
  var exit = 0;
  const CX = 'XXXXX';
  const API_KEY = 'XXXXX';
  const SEARCH = 'Test Query';
  console.log('start');
  // console.log('QUERY PAGE: ' + pages);
  doSearch(CX, SEARCH, API_KEY, next_page, function(resp) {
    res.send(resp);
  });
  //
  // Functions
  //
  function doSearch(_cx, _search, _api_key, _start, callback) {
    var response = '';
    customsearch.cse.list({ cx: _cx, q: _search, auth: _api_key, start: _start }, function (err, resp) {
      if (err) {
        response = JSON.stringify(err);
      } else {
        // Got the response from custom search
        console.log('Result: ' + resp.searchInformation.formattedTotalResults);
        if (resp.items && resp.items.length > 0) {
          console.log('First result of ' + resp.items.length + ' is ' + resp.items[0].title);
          for (var i = 0; i < resp.items.length; i++) {
            response += resp.items[i].title + "<br>";
            response += resp.items[i].link + "<br><hr>";
          }
        }
        res = {
          response: response,
          next_page: resp.queries.nextPage
        };
      }
      _start += 1;
      if (_start < 6) {
        doSearch(_cx, _search, _api_key, _start, _start * 10 + 1,
          function(resp) {
            response += resp;
          });
      }
      if (callback && typeof callback === "function") callback(response);
    });
  }
});
You can use a third-party service like SerpApi to scrape Google and get back structured JSON.
Example using the Node.js library to get the 4th page of results (num: 10, start: 30):
var gsr = require('GoogleSearchResults');
let serp = new gsr.GoogleSearchResults("demo");
serp.json({
  q: "Coffee",
  num: 10,
  start: 30,
  location: "Portland"
}, (result) => {
  console.log(result);
});
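For completeness, the callback-only approach from the question can also accumulate all 5 pages. A sketch (untested, reusing the customsearch.cse.list call from the question) that threads the running result through the recursion instead of a shared outer variable:

function doSearch(cx, q, auth, start, acc, callback) {
  customsearch.cse.list({ cx: cx, q: q, auth: auth, start: start }, function (err, resp) {
    if (err) return callback(JSON.stringify(err));
    var page = '';
    (resp.items || []).forEach(function (item) {
      page += item.title + '<br>' + item.link + '<br><hr>';
    });
    // Custom Search paginates by result index: page n starts at (n - 1) * 10 + 1
    if (start < 41) {
      doSearch(cx, q, auth, start + 10, acc + page, callback);
    } else {
      callback(acc + page);
    }
  });
}

// Usage: doSearch(CX, SEARCH, API_KEY, 1, '', function (html) { res.send(html); });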
