I have a dynamic string that is generated like one of the following:
var q = "FROM Table SELECT avg(1), avg(2), avg(3) where x='y'";
var q = "SELECT avg(1), avg(2), avg(3) FROM Table where z='x' since x days ago";
The values after the select are also dynamic where there could be 1 select option, or 10. I'm trying to create some logic to always pluck whatever is selected into an array, but having trouble dealing with the dynamic nature (string being constructed dynamically AND the # of selects being dynamic).
Basically, end result something like this:
['avg(1)', 'avg(2)', 'avg(3)']
Currently I'm doing something like the following, but it always expects the string to be formatted in a certain order (always starting with SELECT and where after the fields to pluck):
// Capture whatever sits between the SELECT keyword and the where clause,
// then break the captured field list apart on commas.
let splitQ = q.match(/.*SELECT(.*)where/);
let capturedFields = splitQ[1].trim();
let selects = capturedFields.split(",");
Here is a working solution.
It makes these assumptions about the query (after lowercased).
the values come after the first instance of the word 'select '
if the query starts with 'from', values end before the first instance of ' where'
if the query starts with 'select', values end before the first instance of ' from'
// Sample queries: the first puts FROM before SELECT, the second starts with SELECT.
const test1 = `FROM Table SELECT avg(1), avg(2), avg(3) where x='y'`;
const test2 = `SELECT avg(1), avg(2), avg(3) FROM Table where z='x' since x days ago`;
/**
 * Extracts the comma-separated select expressions from a query string.
 *
 * Two clause orders are supported (keywords are matched case-insensitively):
 *   - "FROM ... SELECT a, b where ...": values end before the first " where"
 *   - "SELECT a, b FROM ...":           values end before the first " from"
 *
 * When the terminating keyword is absent, everything after "select " is
 * treated as the value list. Returned values keep the casing they had in
 * the original query.
 *
 * @param {string} query - The query text to inspect.
 * @returns {string[]} The trimmed select expressions, in order.
 * @throws {Error} 'query not handled' when the query starts with neither
 *   "from" nor "select", or contains no "select " keyword at all.
 */
function extractValues(query) {
  // The keyword that terminates the value list depends on the clause order.
  let valuesEndBeforeMe;
  if (/^from/i.test(query)) {
    valuesEndBeforeMe = / where/i;
  } else if (/^select/i.test(query)) {
    valuesEndBeforeMe = / from/i;
  } else {
    throw new Error('query not handled');
  }

  // In both scenarios the values come directly after the first 'select '.
  const selectMatch = /select /i.exec(query);
  if (selectMatch === null) {
    throw new Error('query not handled');
  }
  const afterSelect = query.slice(selectMatch.index + selectMatch[0].length);

  // A missing terminator must not be passed to slice(): an index of -1
  // would silently chop the last character off the final value.
  const endIndex = afterSelect.search(valuesEndBeforeMe);
  const valueList = endIndex === -1 ? afterSelect : afterSelect.slice(0, endIndex);

  // split values and trim whitespace
  return valueList.split(',').map((item) => item.trim());
}
// Run the extractor over both sample queries and print each result.
[test1, test2].forEach((sampleQuery) => {
  console.log(extractValues(sampleQuery));
});
This question already has answers here:
node-postgres: how to execute "WHERE col IN (<dynamic value list>)" query?
(7 answers)
Closed 2 years ago.
I have to use an SQL query like the one below in a Node.js app.
SELECT * FROM my_table
WHERE my_column IN ['name1','name2']
The array ['name1', 'name2'] is inputted by the user. And it may contain more than 2 names sometimes. How do I format this query using the pg-format package to avoid SQL injection?
Aren't the IN clause arguments supposed to be wrapped using parentheses?
Anyway, here's an example on formatting using pg-format,
var format = require('pg-format');
// Each %L placeholder is filled, in order, with the matching value escaped as an SQL literal.
var sql = format.withArray(
  "SELECT * FROM my_table WHERE my_column IN (%L, %L)",
  ['Alice', 'Bob'] // name1, name2
);
console.log(sql);
Edit 1:
With dynamic names using an array,
var format = require('pg-format');
var names = ['Alice', 'Bob', 'Charlie'];
// pg-format expands an array passed to %L into a comma-separated list of
// individually escaped literals, so one placeholder covers any number of names
// and no placeholder string has to be assembled by hand.
// NOTE: an empty array yields "IN ()", which is not valid SQL; callers taking
// user input should handle the empty case before building the query.
var sql = format("SELECT * FROM my_table WHERE my_column IN (%L)", names);
console.log(sql);
I hope this helps.
I want to pass a table as parameter on an ajax callback procedure in Oracle APEX 5, because I need to make an SQL query on that table.
The SQL process is stored as shared component inside the Apex 5 application. Screenshot
My procedure is like this
(procedure name: THIS_PROCESS)
-- Body of the THIS_PROCESS callback as posted: reads Columname for one ID
-- from a table whose name is held in a variable.
declare
v_tablename varchar(128);--max table_name length
v_ID number;
-- NOTE(review): v_somevar is declared without a datatype; confirm the intended type
v_somevar
BEGIN
-- NOTE(review): the trailing comma after Columname looks unintended
SELECT Columname,
INTO v_somevar
-- In static SQL this names a table literally called v_tablename rather than
-- using the variable's value, which matches the reported ORA-00942.
FROM v_tablename
WHERE ID = v_ID;
--Do stuff
END;
This code (FROM v_tablename) gives me a compilation error:
ORA-00942: table or view does not exist ORA-06550: line 9, column 5:
PL/SQL: SQL Statement ignored
I'm a total newbie. I had been reading that I should call that procedure with this javascript:
// Invoke the "THIS_PROCESS" Ajax Callback process.
// x01 and x02 are generic parameter slots; on the server they are read as
// apex_application.g_x01 and apex_application.g_x02.
apex.server.process("THIS_PROCESS", {
  x01: "TABLENAME",
  x02: "Row_ID",
  pageItems: "#P1_Item,#P2_Item"
}, {
  success: function (pData) {
    // do something here
  }
});
I do not understand why I should pass x01 and x02 instead of v_tablename and v_ID
Do x01 and x02 automatically are assigned to v_tablename and v_ID?
Here's an example page process THIS_PROCESS of type "Ajax Callback". Note that you need Dynamic SQL to select from a table name that isn't hardcoded.
-- Ajax Callback body: reads Columname for one ID from a caller-supplied table.
declare
  -- The x01 / x02 values sent by apex.server.process arrive in these globals.
  v_table   varchar2(128) := apex_application.g_x01;
  v_id      number := apex_application.g_x02;
  v_somevar varchar2(100);
  v_sql     varchar2(4000);
begin
  -- Validate v_table against the data dictionary to avoid SQL injection.
  -- An unknown name raises NO_DATA_FOUND. "rownum = 1" keeps the lookup from
  -- raising TOO_MANY_ROWS when the same table name exists in several schemas.
  select table_name
    into v_table
    from all_tables
   where table_name = v_table
     and rownum = 1;

  -- enquote_name(..., false) double-quotes the dictionary name without
  -- upper-casing it, so the concatenated text is always a single identifier.
  v_sql := 'SELECT Columname FROM '
           || dbms_assert.enquote_name(v_table, false)
           || ' WHERE ID = :A1';

  -- The ID is passed as a bind variable, never concatenated.
  execute immediate v_sql into v_somevar using v_id;
  -- do something with v_somevar
end;
Do be careful with this sort of thing - this design will allow a malicious user to write their own javascript function which can pass any table name that it likes to your procedure.
You need to use dynamic sql:
-- Dynamic-SQL variant: the table name comes from the Ajax call (x01) and the
-- row ID from x02, read through the apex_application globals.
declare
  v_tablename varchar2(128) := apex_application.g_x01; --max table_name length
  v_sql       varchar2(1000);
  v_ID        number := apex_application.g_x02;
  v_somevar   varchar2(100);
BEGIN
  -- simple_sql_name raises an error unless the value is a plain SQL identifier,
  -- so a crafted table name cannot inject additional SQL into the statement.
  v_sql := 'SELECT Columname FROM '
           || dbms_assert.simple_sql_name(v_tablename)
           || ' where ID = :1';
  -- The ID is supplied as a bind variable.
  EXECUTE IMMEDIATE v_sql INTO v_somevar USING v_ID;
  --Do stuff
END;
/
In my Cordova app, I need to query a SQLite database and select rows where the value of the column EventName contains a substring. I want to be able to use ? to hold values to avoid SQL injection. I tried this query:
SELECT * FROM EventName WHERE 1 = 1 AND lower(EventName) LIKE lower('%?%');
This is my JavaScript code that I use to query the database:
/**
 * Queries the Event table, optionally filtering on a case-insensitive
 * substring of EventName, and passes the result set to `onSearch`.
 *
 * @param {function} onSearch - Called with the result set handed to the
 *   executeSql success callback.
 * @param {?string} eventName - Substring to look for; null, undefined or ""
 *   disables the filter.
 */
function searchEvent(onSearch, eventName) {
  // First create the query string
  const params = [];
  let query = "SELECT * FROM Event WHERE 1 = 1";
  if (eventName != null && eventName !== "") {
    // A ? inside a quoted SQL literal ('%?%') is plain text, not a placeholder,
    // which is why the statement reported a placeholder/argument mismatch.
    // Keep the ? bare and attach the LIKE wildcards to the bound value instead.
    query += " AND lower(EventName) LIKE lower(?)";
    params.push(`%${eventName}%`);
  }
  query += ";";
  console.log(query); // Log the query
  console.log(params); // Log the parameters

  // Then execute query statement
  db.transaction(function (tx) {
    tx.executeSql(query, params, function (tx, rs) {
      onSearch(rs);
    });
  }, function (err) {
    console.log(err);
  });
}
This is the logged query:
SELECT * FROM Event WHERE 1 = 1 AND lower(EventName) LIKE
lower('%?%');
These are the logged parameters:
[ 'myInput' ]
This is the error that was returned:
{
code: 5,
messsage: 'number of \'?\'s in statement string does not match argument count'
}
As you can see there is 1 ? placeholder and 1 input parameter so the numbers DO match. I think it is because the ? is between the single quotes ('') so it is thought to be a part of the searched string. How do I fix this?
EDIT:
The JavaScript statement "SELECT * FROM Event WHERE 1 = 1" + " AND lower(EventName) LIKE lower('%" + eventName + "%')" is ok, but I wanna use a method that can protect me against SQL injection
To protect eventName against SQL injection, validate it with a regular expression that allows only alphanumeric characters plus a whitelist of specific special characters, e.g. /^[ A-Za-z0-9_@./#&+-]*$/. You could also try the regex /^[a-zA-Z0-9!@#\$%\^\&*)(+=._-]+$/g. I do not believe that ? would work with a SQL SELECT statement, so you need to pass +eventName+
Hope it helps.
I'm wondering if anyone knows of a way to measure string similarity in BigQuery.
Seems like would be a neat function to have.
My case is i need to compare the similarity of two urls as want to be fairly sure they refer to the same article.
I can find examples using javascript so maybe a UDF is the way to go but i've not used UDF's at all (or javascript for that matter :) )
Just wondering if there may be a way using existing regex functions or if anyone might be able to get me started with porting the javascript example into a UDF.
Any help much appreciated, thanks
EDIT: Adding some example code
So if i have a UDF defined as:
// distance function
/**
 * UDF entry point: emits the Levenshtein edit distance between
 * row.inputA and row.inputB as {outputA: <distance>}.
 *
 * Missing (undefined or null) inputs are treated as empty strings, so the
 * distance to a missing value is the length of the other string.
 *
 * @param {{inputA: string, inputB: string}} row - Input row.
 * @param {function} emit - Called once with the output row.
 */
function levenshteinDistance (row, emit) {
  var a = row.inputA == null ? '' : String(row.inputA);
  var b = row.inputB == null ? '' : String(row.inputB);

  // Iterative two-row dynamic programme. Recursing into this function with
  // plain strings cannot work because it expects a (row, emit) pair, and the
  // naive recursion is exponential anyway.
  // previous[j] holds the distance between the first i-1 chars of a and the
  // first j chars of b; row 0 is simply "insert j characters".
  var previous = [];
  for (var j = 0; j <= b.length; j++) {
    previous.push(j);
  }
  for (var i = 1; i <= a.length; i++) {
    var current = [i]; // deleting i characters turns a[0..i) into ""
    for (var k = 1; k <= b.length; k++) {
      var substitution = previous[k - 1] + (a[i - 1] !== b[k - 1] ? 1 : 0);
      current.push(Math.min(previous[k] + 1, current[k - 1] + 1, substitution));
    }
    previous = current;
  }

  emit({outputA: previous[b.length]});
}
// Register the distance UDF so it can be called from SQL.
bigquery.defineFunction(
  'levenshteinDistance',                 // SQL-visible function name
  ['inputA', 'inputB'],                  // input column names
  [{name: 'outputA', type: 'integer'}],  // schema of the emitted rows
  levenshteinDistance                    // JavaScript implementation
);
// make a test function to test individual parts
/**
 * Diagnostic UDF: emits inputB's length when inputA is empty,
 * otherwise inputA's length.
 */
function test(row, emit) {
  var reportedLength = row.inputA.length <= 0
    ? row.inputB.length
    : row.inputA.length;
  emit({outputA: reportedLength});
}
// Register the diagnostic UDF so it can be called from SQL.
bigquery.defineFunction(
  'test',                                // SQL-visible function name
  ['inputA', 'inputB'],                  // input column names
  [{name: 'outputA', type: 'integer'}],  // schema of the emitted rows
  test                                   // JavaScript implementation
);
And when I try to test it with a query such as:
SELECT outputA FROM (levenshteinDistance(SELECT "abc" AS inputA, "abd" AS inputB))
I get error:
Error: TypeError: Cannot read property 'substr' of undefined at line 11, columns 38-39
Error Location: User-defined function
It seems like maybe row.inputA is not a string perhaps or for some reason string functions not able to work on it. Not sure if this is a type issue or something funny about what utils the UDF is able to use by default.
Again any help much appreciated, thanks.
Ready to use shared UDFs - Levenshtein distance:
SELECT fhoffa.x.levenshtein('felipe', 'hoffa')
, fhoffa.x.levenshtein('googgle', 'goggles')
, fhoffa.x.levenshtein('is this the', 'Is This The')
6 2 0
Soundex:
SELECT fhoffa.x.soundex('felipe')
, fhoffa.x.soundex('googgle')
, fhoffa.x.soundex('guugle')
F410 G240 G240
Fuzzy choose one:
SELECT fhoffa.x.fuzzy_extract_one('jony'
, (SELECT ARRAY_AGG(name)
FROM `fh-bigquery.popular_names.gender_probabilities`)
#, ['john', 'johnny', 'jonathan', 'jonas']
)
johnny
How-to:
https://medium.com/@hoffa/new-in-bigquery-persistent-udfs-c9ea4100fd83
If you're familiar with Python, you can use the functions defined by fuzzywuzzy in BigQuery using external libraries loaded from GCS.
Steps:
Download the javascript version of fuzzywuzzy (fuzzball)
Take the compiled file of the library: dist/fuzzball.umd.min.js and rename it to a clearer name (like fuzzball)
Upload it to a google cloud storage bucket
Create a temp function to use the lib in your query (set the path in OPTIONS to the relevant path)
-- Temporary JavaScript UDF that forwards both strings to
-- fuzzball.token_set_ratio, using the library file loaded from the
-- Cloud Storage path given in OPTIONS.
CREATE TEMP FUNCTION token_set_ratio(a STRING, b STRING)
RETURNS FLOAT64
LANGUAGE js AS """
return fuzzball.token_set_ratio(a, b);
"""
OPTIONS (
library="gs://my-bucket/fuzzball.js");
-- Example call on a single inline row.
with data as (select "my_test_string" as a, "my_other_string" as b)
SELECT a, b, token_set_ratio(a, b) from data
Levenshtein via JS would be the way to go. You can use the algorithm to get absolute string distance, or convert it to a percentage similarity by simply calculating abs((strlen - distance) / strlen).
The easiest way to implement this would be to define a Levenshtein UDF that takes two inputs, a and b, and calculates the distance between them. The function could return a, b, and the distance.
To invoke it, you'd then pass in the two URLs as columns aliased to 'a' and 'b':
SELECT a, b, distance
FROM
Levenshtein(
SELECT
some_url AS a, other_url AS b
FROM
your_table
)
Below is quite simpler version for Hamming Distance by using WITH OFFSET instead of ROW_NUMBER() OVER()
#standardSQL
-- Sample strings to compare against the fixed target 'abcdef'.
WITH Input AS (
SELECT 'abcdef' AS strings UNION ALL
SELECT 'defdef' UNION ALL
SELECT '1bcdef' UNION ALL
SELECT '1bcde4' UNION ALL
SELECT '123de4' UNION ALL
SELECT 'abc123'
)
SELECT 'abcdef' AS target, strings,
-- Correlated subquery: split both strings into characters, pair the
-- characters that share the same offset, and count the pairs that differ.
(SELECT COUNT(1)
FROM UNNEST(SPLIT('abcdef', '')) a WITH OFFSET x
JOIN UNNEST(SPLIT(strings, '')) b WITH OFFSET y
ON x = y AND a != b) hamming_distance
FROM Input
I did it like this:
-- Returns the fraction of a's distinct trigrams that also occur among b's
-- distinct trigrams. Both inputs are lower-cased first. The measure is not
-- symmetric: it is computed relative to a's trigram set.
CREATE TEMP FUNCTION trigram_similarity(a STRING, b STRING) AS (
(
-- Distinct n-grams built over the individual characters of a.
-- NOTE(review): [3,3] presumably restricts ML.NGRAMS to size exactly 3; confirm.
WITH a_trigrams AS (
SELECT
DISTINCT tri_a
FROM
unnest(ML.NGRAMS(SPLIT(LOWER(a), ''), [3,3])) AS tri_a
),
-- Same construction for b.
b_trigrams AS (
SELECT
DISTINCT tri_b
FROM
unnest(ML.NGRAMS(SPLIT(LOWER(b), ''), [3,3])) AS tri_b
)
-- LEFT JOIN keeps every trigram of a; tri_b is non-NULL only where b has the
-- same trigram, so COUNTIF / COUNT is the matched share.
SELECT
COUNTIF(tri_b IS NOT NULL) / COUNT(*)
FROM
a_trigrams
LEFT JOIN b_trigrams ON tri_a = tri_b
)
);
Here is a comparison to Postgres's pg_trgm:
select trigram_similarity('saemus', 'seamus');
-- 0.25 vs. pg_trgm 0.272727
select trigram_similarity('shamus', 'seamus');
-- 0.5 vs. pg_trgm 0.4
I gave the same answer on How to perform trigram operations in Google BigQuery?
I couldn't find a direct answer to this, so I propose this solution, in standard SQL
#standardSQL
-- Hamming distance: the number of positions at which a and b differ.
-- Characters are paired by their array offset (WITH OFFSET), which is the
-- character's actual position. ROW_NUMBER() OVER() without an ORDER BY does
-- not guarantee that numbering follows the array order, so it is not used.
-- Intended for strings of equal length; positions beyond the end of the
-- shorter string have no partner in the join and are ignored.
CREATE TEMP FUNCTION HammingDistance(a STRING, b STRING) AS (
  (
    SELECT
      SUM(IF(char_a != char_b, 1, 0)) AS diff
    FROM
      UNNEST(SPLIT(a, "")) AS char_a WITH OFFSET AS pos_a
    JOIN
      UNNEST(SPLIT(b, "")) AS char_b WITH OFFSET AS pos_b
    ON
      pos_a = pos_b
  )
);
WITH Input AS (
SELECT 'abcdef' AS strings UNION ALL
SELECT 'defdef' UNION ALL
SELECT '1bcdef' UNION ALL
SELECT '1bcde4' UNION ALL
SELECT '123de4' UNION ALL
SELECT 'abc123'
)
SELECT strings, 'abcdef' as target, HammingDistance('abcdef', strings) as hamming_distance
FROM Input;
Compared to other solutions (like this one), it takes two strings (of the same length, following the definition for hamming distance) and outputs the expected distance.
bigquery similarity standardsql hammingdistance
While I was looking for the answer Felipe above, I worked on my own query and ended up with two versions, one which I called string approximation and another string resemblance.
The first looks at the shortest distance between the letters of the source string and the test string and returns a score between 0 and 1, where 1 is a complete match. It always scores based on the longer of the two strings. It turns out to return results similar to the Levenshtein distance.
#standardSql
-- Scores how closely testString approximates sourceString (case-insensitive).
-- The result is the average of per-position scores, each between 0 and 1.
CREATE OR REPLACE FUNCTION `myproject.func.stringApproximation`(sourceString STRING, testString STRING) AS (
-- Outer level: average the best score found for every reference position.
(select avg(best_result) from (
-- ref is the character offset in whichever string is longer
-- (testoffset when the two lengths are equal).
select if(length(testString)<length(sourceString), sourceoffset, testoffset) as ref,
-- A smaller offset distance gives a higher score: distance 0 -> 1, 1 -> 1/2, ...
case
when min(result) is null then 0
else 1 / (min(result) + 1)
end as best_result,
from (
-- Pair every source character with every test character. Equal characters
-- get the absolute difference of their offsets; unequal characters get the
-- length of the longer string as a penalty distance.
select *,
if(source = test, abs(sourceoffset - (testoffset)),
greatest(length(testString),length(sourceString))) as result
from unnest(split(lower(sourceString),'')) as source with offset sourceoffset
cross join
(select *
from unnest(split(lower(testString),'')) as test with offset as testoffset)
) as results
group by ref
)
)
);
The second is a variation of the first, where it will look at sequences of matching distances, so that a character matching at equal distance from the character preceding or following it will count as one point. This works quite well, better than string approximation but not quite as well as I would like to (see example output below).
#standardSql
-- Scores the resemblance of testString to sourceString (case-insensitive) as
-- the share of reference positions whose matching offset distances continue
-- into a neighbouring position.
CREATE OR REPLACE FUNCTION `myproject.func.stringResemblance`(sourceString STRING, testString STRING) AS (
(
-- Outer level: average of the 0/1 sequence flags over all reference positions.
select avg(sequence)
from (
select ref,
-- sequence = 1 when this position's distance list shares at least one value
-- with the list of the previous or the next position (ordered by ref).
if(array_length(array(select * from comparison.collection intersect distinct
(select * from comparison.before))) > 0
or array_length(array(select * from comparison.collection intersect distinct
(select * from comparison.after))) > 0
, 1, 0) as sequence
from (
-- Attach the neighbouring positions' distance lists to every position.
select ref,
collection,
lag(collection) over (order by ref) as before,
lead(collection) over (order by ref) as after
from (
-- ref is the character offset in whichever string is longer
-- (testoffset when the two lengths are equal); collection gathers the
-- offset distances of all matching characters for that position.
select if(length(testString) < length(sourceString), sourceoffset, testoffset) as ref,
array_agg(result ignore nulls) as collection
from (
-- Pair every source character with every test character; only equal
-- characters produce a distance, all other pairs yield NULL.
select *,
if(source = test, abs(sourceoffset - (testoffset)), null) as result
from unnest(split(lower(sourceString),'')) as source with offset sourceoffset
cross join
(select *
from unnest(split(lower(testString),'')) as test with offset as testoffset)
) as results
group by ref
)
) as comparison
)
)
);
Now here is a sample of result:
#standardSQL
-- Compare several name variants against 'benjamin artis' using both scoring
-- functions, best resemblance first.
with test_subjects as (
  select 'benji' as name union all
  select 'benjamin' union all
  select 'benjamin alan artis' union all
  select 'ben artis' union all
  select 'artis benjamin'
)
select
  name,
  -- Call the functions under the same path they were created with.
  `myproject.func.stringApproximation`('benjamin artis', name) as approximation,
  `myproject.func.stringResemblance`('benjamin artis', name) as resemblance
from test_subjects
order by resemblance desc
This returns
+---------------------+--------------------+--------------------+
| name | approximation | resemblance |
+---------------------+--------------------+--------------------+
| artis benjamin | 0.2653061224489796 | 0.8947368421052629 |
+---------------------+--------------------+--------------------+
| benjamin alan artis | 0.6078947368421053 | 0.8947368421052629 |
+---------------------+--------------------+--------------------+
| ben artis | 0.4142857142857142 | 0.7142857142857143 |
+---------------------+--------------------+--------------------+
| benjamin | 0.6125850340136053 | 0.5714285714285714 |
+---------------------+--------------------+--------------------+
| benji | 0.36269841269841263| 0.28571428571428575|
+---------------------+--------------------+--------------------+
Edited: updated the resemblance algorithm to improve results.
Try Flookup for Google Sheets... it's definitely faster than Levenshtein distance and it calculates percentage similarities right out of the box.
One Flookup function you might find useful is this:
FUZZYMATCH (string1, string2)
Parameter Details
string1: compares to string2.
string2: compares to string1.
The percentage similarity is then calculated based on these comparisons. Both parameters can be ranges.
I'm currently trying to optimise it for large data sets, so your feedback would be very welcome.
Edit: I'm the creator of Flookup.