Add strict_word_similarity to pg_trgm module

feodor · feodor · commit be8a7a686627 · 2018-03-21T14:57:42.000+03:00
strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile
@@ -4,11 +4,12 @@ MODULE_big = pg_trgm
 OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
 
 EXTENSION = pg_trgm
-DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
+DATA = pg_trgm--1.3--1.4.sql \
+	pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
 	pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
 PGFILEDESC = "pg_trgm - trigram matching"
 
-REGRESS = pg_trgm pg_word_trgm
+REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/pg_trgm/expected/pg_strict_word_trgm.out b/contrib/pg_trgm/expected/pg_strict_word_trgm.out
diff --git a/contrib/pg_trgm/pg_trgm--1.3--1.4.sql b/contrib/pg_trgm/pg_trgm--1.3--1.4.sql
@@ -0,0 +1,68 @@
+/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
+
+CREATE FUNCTION strict_word_similarity(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+CREATE FUNCTION strict_word_similarity_op(text,text)
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE PARALLEL SAFE;  -- stable because depends on pg_trgm.word_similarity_threshold
+
+CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE PARALLEL SAFE;  -- stable because depends on pg_trgm.word_similarity_threshold
+
+CREATE OPERATOR <<% (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_op,
+        COMMUTATOR = '%>>',
+        RESTRICT = contsel,
+        JOIN = contjoinsel
+);
+
+CREATE OPERATOR %>> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_commutator_op,
+        COMMUTATOR = '<<%',
+        RESTRICT = contsel,
+        JOIN = contjoinsel
+);
+
+CREATE FUNCTION strict_word_similarity_dist_op(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+CREATE OPERATOR <<<-> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_dist_op,
+        COMMUTATOR = '<->>>'
+);
+
+CREATE OPERATOR <->>> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_dist_commutator_op,
+        COMMUTATOR = '<<<->'
+);
+
+ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
+        OPERATOR        9       %>> (text, text),
+        OPERATOR        10       <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
+
+ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
+        OPERATOR        9       %>> (text, text);
diff --git a/contrib/pg_trgm/pg_trgm.control b/contrib/pg_trgm/pg_trgm.control
@@ -1,5 +1,5 @@
 # pg_trgm extension
 comment = 'text similarity measurement and index searching based on trigrams'
-default_version = '1.3'
+default_version = '1.4'
 module_pathname = '$libdir/pg_trgm'
 relocatable = true
diff --git a/contrib/pg_trgm/sql/pg_strict_word_trgm.sql b/contrib/pg_trgm/sql/pg_strict_word_trgm.sql
@@ -0,0 +1,42 @@
+DROP INDEX trgm_idx2;
+
+\copy test_trgm3 from 'data/trgm2.data'
+
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+
+create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+
+explain (costs off)
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+
+drop index trgm_idx2;
+create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+
+set "pg_trgm.strict_word_similarity_threshold" to 0.4;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+
+set "pg_trgm.strict_word_similarity_threshold" to 0.2;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
@@ -6,6 +6,7 @@
 
 #include "access/gist.h"
 #include "access/itup.h"
+#include "access/stratnum.h"
 #include "storage/bufpage.h"
 
 /*
@@ -26,14 +27,16 @@
 #define DIVUNION
 
 /* operator strategy numbers */
-#define SimilarityStrategyNumber		1
-#define DistanceStrategyNumber			2
-#define LikeStrategyNumber				3
-#define ILikeStrategyNumber				4
-#define RegExpStrategyNumber			5
-#define RegExpICaseStrategyNumber		6
-#define WordSimilarityStrategyNumber	7
-#define WordDistanceStrategyNumber		8
+#define SimilarityStrategyNumber			1
+#define DistanceStrategyNumber				2
+#define LikeStrategyNumber					3
+#define ILikeStrategyNumber					4
+#define RegExpStrategyNumber				5
+#define RegExpICaseStrategyNumber			6
+#define WordSimilarityStrategyNumber		7
+#define WordDistanceStrategyNumber			8
+#define StrictWordSimilarityStrategyNumber	9
+#define StrictWordDistanceStrategyNumber	10
 
 typedef char trgm[3];
 
@@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
 
 extern double similarity_threshold;
 extern double word_similarity_threshold;
+extern double strict_word_similarity_threshold;
 
+extern double index_strategy_get_limit(StrategyNumber strategy);
 extern uint32 trgm2int(trgm *ptr);
 extern void compact_trigram(trgm *tptr, char *str, int bytelen);
 extern TRGM *generate_trgm(char *str, int slen);
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
@@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
+		case StrictWordSimilarityStrategyNumber:
 			trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
 			break;
 		case ILikeStrategyNumber:
@@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			nlimit = index_strategy_get_limit(strategy);
 
 			/* Count the matches */
 			ntrue = 0;
@@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			nlimit = index_strategy_get_limit(strategy);
 
 			/* Count the matches */
 			ntrue = 0;
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
@@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 		{
 			case SimilarityStrategyNumber:
 			case WordSimilarityStrategyNumber:
+			case StrictWordSimilarityStrategyNumber:
 				qtrg = generate_trgm(VARDATA(query),
 									 querysize - VARHDRSZ);
 				break;
@@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			/* Similarity search is exact. Word similarity search is inexact */
-			*recheck = (strategy == WordSimilarityStrategyNumber);
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			/* Similarity search is exact. (Strict) word similarity search is inexact */
+			*recheck = (strategy != SimilarityStrategyNumber);
+
+			nlimit = index_strategy_get_limit(strategy);
 
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
@@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	{
 		case DistanceStrategyNumber:
 		case WordDistanceStrategyNumber:
-			*recheck = strategy == WordDistanceStrategyNumber;
+		case StrictWordDistanceStrategyNumber:
+			/* Only plain trigram distance is exact */
+			*recheck = (strategy != DistanceStrategyNumber);
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
 
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml

-Original file line number
+Diff line change
        the explanation below.
       </entry>
      </row>
 +     <row>
 +      <entry>
 +       <function>strict_word_similarity(text, text)</function>
 +       <indexterm><primary>strict_word_similarity</primary></indexterm>
 +      </entry>
 +      <entry><type>real</type></entry>
 +      <entry>
 +       Same as <function>word_similarity(text, text)</function>, but forces
 +       extent boundaries to match word boundaries.
 +      </entry>
 +     </row>
      <row>
       <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
       <entry><type>real</type></entry>
    a part of the word.
   </para>
 +  <para>
 +   At the same time, <function>strict_word_similarity(text, text)</function>
 +   has to select an extent that matches word boundaries.  In the example above,
 +   <function>strict_word_similarity(text, text)</function> would select the
 +   extent <literal>{"  w"," wo","wor","ord","rds", ds "}</literal>, which
 +   corresponds to the whole word <literal>'words'</literal>.
++
 +<programlisting>
 +# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
 + strict_word_similarity | similarity
 +------------------------+------------
 +               0.571429 |   0.571429
 +(1 row)
 +</programlisting>
 +  </para>
++
 +  <para>
 +   Thus, the <function>strict_word_similarity(text, text)</function> function
 +   is useful for finding similar subsets of whole words, while
 +   <function>word_similarity(text, text)</function> is more suitable for
 +   searching similar parts of words.
 +  </para>
++
   <table id="pgtrgm-op-table">
    <title><filename>pg_trgm</filename> Operators</title>
    <tgroup cols="3">
        Commutator of the <literal>&lt;%</literal> operator.
       </entry>
      </row>
 +     <row>
 +      <entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
 +      <entry><type>boolean</type></entry>
 +      <entry>
 +       Returns <literal>true</literal> if its second argument has a continuous
 +       extent of an ordered trigram set that matches word boundaries,
 +       and its similarity to the trigram set of the first argument is greater
 +       than the current strict word similarity threshold set by the
 +       <varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
 +      </entry>
 +     </row>
 +     <row>
 +      <entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
 +      <entry><type>boolean</type></entry>
 +      <entry>
 +       Commutator of the <literal>&lt;&lt;%</literal> operator.
 +      </entry>
 +     </row>
      <row>
       <entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
       <entry><type>real</type></entry>
        Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
       </entry>
      </row>
 +     <row>
 +      <entry>
 +       <type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
 +      </entry>
 +      <entry><type>real</type></entry>
 +      <entry>
 +       Returns the <quote>distance</quote> between the arguments, that is
 +       one minus the <function>strict_word_similarity()</function> value.
 +      </entry>
 +     </row>
 +     <row>
 +      <entry>
 +       <type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
 +      </entry>
 +      <entry><type>real</type></entry>
 +      <entry>
 +       Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
 +      </entry>
 +     </row>
     </tbody>
    </tgroup>
   </table>
   <para>
    Also you can use an index on the <structfield>t</structfield> column for word
 -   similarity.  For example:
 +   similarity or strict word similarity.  Typical queries are:
 <programlisting>
 SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
   FROM test_trgm
   WHERE '<replaceable>word</replaceable>' &lt;% t
   ORDER BY sml DESC, t;
 +</programlisting>
 +   and
 +<programlisting>
 +SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
 +  FROM test_trgm
 +  WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
 +  ORDER BY sml DESC, t;
 </programlisting>
    This will return all values in the text column for which there is a
    continuous extent in the corresponding ordered trigram set that is
   </para>
   <para>
 -   A variant of the above query is
 +   Possible variants of the above queries are:
 <programlisting>
 SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
   FROM test_trgm
   ORDER BY dist LIMIT 10;
 +</programlisting>
 +   and
 +<programlisting>
 +SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
 +  FROM test_trgm
 +  ORDER BY dist LIMIT 10;
 </programlisting>
    This can be implemented quite efficiently by GiST indexes, but not
    by GIN indexes.