From 523728defe8ebbca36f26602510b715013b11f00 Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Wed, 14 Sep 2022 18:12:51 +0200 Subject: [PATCH 1/5] ClassificationAnalyzer --- .../entity/arangosearch/AnalyzerType.java | 15 +++- .../analyzer/ClassificationAnalyzer.java | 64 +++++++++++++++ .../ClassificationAnalyzerProperties.java | 78 +++++++++++++++++++ .../java/com/arangodb/ArangoSearchTest.java | 24 ++++++ 4 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java index 7a5ea5918..15832c441 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java +++ b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java @@ -24,5 +24,18 @@ * @author Michele Rastelli */ public enum AnalyzerType { - identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation, collation + identity, + delimiter, + stem, + norm, + ngram, + text, + pipeline, + stopwords, + aql, + geojson, + geopoint, + segmentation, + collation, + classification } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java new file mode 100644 index 000000000..55eb3a47a --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java @@ -0,0 +1,64 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer capable of classifying tokens in the input text. It applies a user-provided supervised fastText word + * embedding model to classify the input text. It is able to classify individual tokens as well as entire inputs. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.10 + */ +public class ClassificationAnalyzer extends SearchAnalyzer { + public ClassificationAnalyzer() { + setType(AnalyzerType.classification); + } + + private ClassificationAnalyzerProperties properties; + + public ClassificationAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(ClassificationAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + ClassificationAnalyzer that = (ClassificationAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java new file mode 100644 index 000000000..76092580e --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java @@ -0,0 +1,78 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.velocypack.annotations.SerializedName; + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.10 + */ +public class ClassificationAnalyzerProperties { + + @SerializedName("model_location") + private String modelLocation; + + @SerializedName("top_k") + private Integer topK; + + private Double threshold; + + public String getModelLocation() { + return modelLocation; + } + + public void setModelLocation(String modelLocation) { + this.modelLocation = modelLocation; + } + + public Integer getTopK() { + return topK; + } + + public void setTopK(Integer topK) { + this.topK = topK; + } + + public Double getThreshold() { + return threshold; + } + + public void setThreshold(Double threshold) { + this.threshold = threshold; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + ClassificationAnalyzerProperties that = (ClassificationAnalyzerProperties) o; + return Objects.equals(modelLocation, that.modelLocation) && Objects.equals(topK, that.topK) && Objects.equals(threshold, that.threshold); + } + + @Override + public int hashCode() { + return Objects.hash(modelLocation, topK, threshold); + } +} diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index 22494fb02..39b97f360 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -987,6 +987,30 @@ void collationAnalyzer(ArangoDatabase db) { createGetAndDeleteTypedAnalyzer(db, collationAnalyzer); } + @ParameterizedTest(name = "{index}") + @MethodSource("dbs") + void classificationAnalyzer(ArangoDatabase db) { + assumeTrue(isAtLeastVersion(3, 10)); + assumeTrue(isEnterprise()); + + ClassificationAnalyzerProperties properties = new ClassificationAnalyzerProperties(); + properties.setModelLocation("/foo/bar"); + properties.setTopK(2); + properties.setThreshold(.5); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + ClassificationAnalyzer analyzer = new ClassificationAnalyzer(); + analyzer.setName("test-" + UUID.randomUUID()); + analyzer.setProperties(properties); + analyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(db, analyzer); + } + @ParameterizedTest(name = "{index}") @MethodSource("dbs") void offsetFeature(ArangoDatabase db) { From 0c0a7466adf3f8a3da09b8876906a994adcc6d10 Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Thu, 15 Sep 2022 09:48:07 +0200 Subject: [PATCH 2/5] added test ML model to docker containers --- docker/foo.bin | Bin 0 -> 5537 bytes docker/start_db.sh | 7 +++++++ .../internal/velocypack/VPackDeserializers.java | 2 ++ src/test/java/com/arangodb/ArangoSearchTest.java | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 docker/foo.bin diff --git a/docker/foo.bin b/docker/foo.bin new file mode 100644 index 0000000000000000000000000000000000000000..97c4296cd3c37f0b8abffa39f6b2975acb2e2546 GIT binary patch literal 5537 zcmZ8l30RJ4_kY{>MWwfWF}AV|CB65#k&&gv7(!@7T4W7ntRa;wp(I39l9aNIw&&a` zR6Z(|q9R);qDH02g#YvO)tLWxuIpUqocs4XzjN-Mx8AOY)15g=RU{I5@h;cvQvZrF zd@lIEP2i50Z1k?)#$civpR4dL&AX5b_Py7p`)w2nT0ORUgQxG0%FEB&|3|gfFTf+f z??>$);IaPSY!4qFZ_j`qxW|vw)7RVE-}^`H@3F!2U*&pl&*f{qC;q_cDE|pRs>O>} zc`Wr_wRrLWZ-Iq0?j&cNb){ zHcIqH51?t)dNgt9O)h8UbjY+6)8+Y1aI9lDevW*`-OWBBX-v2d<{ryP;siN%tLZWq zBV7;4$F4%n%R88RISa02mZQx5G7`DfA8~Lamo&qiR@v0y-KF!fxkZ^d89cz+FfscS zIgDH6>;=*a>g;@xCdRlv#$`2TG`G49UYab0d9xZwazs4Niv36o{i9KC+EJ|XkY(?Z zM?l%;S7k1odAK=`&v+($H6_|D= zL4VcN_`qlwx%IXI9gFh7)>MIhE64`}X2zcFo+Sxt3&oFlv83=#s%@0(X4En?p(k|p zsGD*acergQe%soQWxgr|yI>ON%$7uF`Nk~EAQK;6pq z=z`HVxKjbsLE21AKc+On1*I^ovwy~^9z7*F^!IhRVZV&z43}fBtFLew;(GX;b`{#M z-$6-J7U(yZW73&2(pbVjw*`&dFh_IR&Of&SN9SYZ8)ar_{Q&pGiP@KR7To;ZUXb)w zjn%koqIk|@G^jJ9*#&Jd)@mt0>1$H7JRTcXw3B`b(fHejqgcm3x3vi)z%b+$9y8FU zS#|xHw#5n9=3&iow=*z48A-Uh60A5E1it%SA-(DW`rbYZW7@0W+MOgQoUt0s+$>0e zLL)AX%mc$A3e+(#AL=U1SZ#)jBtU96KCVASJ`KpUEy~=4+Jj8!l%aamSU!wPXxNFr zuItBQnhRlnkO|dP*Jk6krsDDKQf%yRPs^1v{p>aC2d%rg8 z9+gYpJ=I`dIwi1N&6w)k{1Zn3CgP7gybiCa`r6yjEZxNz{uQ)-HkA#t6SA*;N1q;YPB67 z9#W<{xd)&&q>+rttcFkO2F#t56T3!v)+keoDa|dgfZGH^Yu6B08$DF9j3c25R(R>| zGd%1)6yz3F!5>8?^i{A2S`T%H(N!<8;O!K&a8Y5-tJJCW-%~Np@dFNm40nA_i+rEf6?r!9K)#vxU@L`|ATYN>*T{5Onvr2ug@ z`n1$hnrUBqkaKBiE?OC>v9Y)NvxZ%>K)(NI*zMS#?yoiCaIqYXSs=rvMZLkU7oSjl zi8&0@(4}Isof~_n1a@q>fVQ{COJwHP!h_c;EY;Kn6oT%+>;WzMjB!*v+< zM?7vE?+xz3*GTB>|6ujX%XnwnYUpGf_cmx8(P}ZFhF_(r%fd)3S*%N~JO74mhnr-) z=NA}sHW}5ouOy#4EOGP5kWIV@-YoB_}t=)lJyFhF8Uo* z4u^wBXemt5kYi^m&FOgU$C9dt@wn-cESHyb3S%q!)02Zl;P5n>OaEqy3lE%xy-V~l zEXIHhnr2L%uKNm=tt+9xGg$IdyDGhvs738tL{#MKh+{U%Q|XDdxYgkk_u#QMbsh8) z74IK{^N0O0EJ+cTlVmb(hZ$YyqC@))Ri}@wZs2(P_b}DH0gpr+ggq@~cxYZKC$~a_ zmiqd_8RKj`CT@c1)h95}{S0{HckFsQBafczjj4*!2iW^66z=cVf-86H!8+(FmuOan2`6U4 zgE^WkYV|X;h^j=V=z&;ib`)Zxjq&tzIo9cR2_8y+#!EM%V88QPycrRP^Dj<D{{p2-XEDcS-G3R)pRma z8)LQwhTsu51y-P-#FBUFVdipUG9^U`>@ME`1@#)7u|SimO>Ks^1>YfeL^w9+Kjgw} z^1$ZtOfXj20bg6p*$S6ne0tTC<)$3Q2@Y+zx#1ld^t2i)PTNC(ebXmGn zIcjW7#WijwOz+oJ{LrCFBW;^NFFzk7k%(U`uc6oJJd^~h({}|oaIn8Fjq<4C!WX|s z%RB{EI&l@2_#8#|hw^mDZ<_RadntS?DMZVKd$Gv=JbuwRhgrdjbhWlDEn6@UTui%i z?!Go=*;(_TvZ)bIxA>w`s4iGng@XL%<7n@`0i)zf$PUbNLU;`}RASlY&0q(A??($*0H zDRd~eruhjT-FXcms|EV~p-VN-%FxE2*rQLhMyM{wi)72 z%RYkn{8G%BI2XUiSYXS-g}6^!o1V3N2pRc0bcvH4rnp+6dcFo7;vR+9w(3!4EWtX@ zNyuJQ=R7fZg7d#6L#tFg*cmbW$;k)A21{|%&97*}e|`tgHew1bE;;!r%W#=}D_l)K z50i#<;s~>gU{Tyf)~%cigq;M_I5AClo=CQY8&GuBVa_qZ#AJU{j+1u*oNKF<=oSW( zKU@`P`P51qD+b-*dtRIKOwyt2R+-R~Rc1`yc?nlNPlv(hYuFig8=MFA zXLi}r_{(({oabatx99hxSHu}usjJ5()+-b1d&XpybPOpEwuGft=J2tqkStq!1E%if zzc+LOVVq+;zUh3+xhmyD>JwMgMvfb>elX-5EJ4#cQ`#w~%Q`qB$sDOoeG+a-hM)p{ zdFVFoIn@m*my4w!#~3fS!csL7#u4_bMGVA?%e`X z?)M>e!d#rDR{|qH%TSA>fjD(lJg4F@6n0(B#c7M$@w>e-&FGM&VHZEZa)Wo=mI;xd zHaZl`Acc$3AB|*Y8+d8|h`^Mt}bQptut5-KKsl z%|aS0lJ)6d;*q$atcye*Pr=Hw9@yrI;GcQ}C2N)GqTGcrLHv*m-=|08=UjyPZ^rcU zSdN^uJO-Af=1eJDlLhd*&c3P$O=UV{ih42F`#C|Lk0CRBq(Ezbf6u8OoPoN{ler|- zW7sa%ViD5|(P_=!;QW&oou$!;dAFQVar-E8cYG#t2V-&gs%YFy4}o^WaIWoqG}teF zjv*Q9G~}%;>kQumurx_9vY4i^e-gE@Bsrh8-&UY`A+Ir$Kk-qI1tUThpXnB(b6FU1I_E9)#eTQ)_H@+3IFHYs?BpD*Mnc8w zy%O)*P%MenpqU;UafWsNc|SbgZpsqnq#-=bkY&x-OPZ=GoDZV>;5m36yNNoN9P z4OC&p2b7r1lF@kjb4|{>Dm~UP_kPaJS?}TZ=?6e#>>D!RBI6vYgSj*Kg!8XygTTPQ zxGe*fXzYw9aAia!X0ETslN#gT#iV&;Z(=3w_}!1JGt*Z|20h)8BQd*DO3X}$+!HPz_c zVIuB|mnn1k)gOY6=8zEquJ}bO9}OO=vB0x3@YH^gByZ;0oLScusfSJ-%$uaeENl0o zzIZo|cAAUTc?Y?pL1|pY-_3BbZW}a*jK%P8Gs)ZzJyhxzLtIEP4z3u$E*^f118>K0 zXUY!Z?Ji^bLMw^v4o!na1x1{%RwlfTYzNV^JdoSvf#c2j&+jOP9`{~jypAICacw4# zU^aYwuge;|r;u9fTzLBRIi#qqf`~jZ8+Y~?=sR`UI?c(!zir;YugP<`ES2XNR;x)} zelCRu!)>tny*_noZi1;D1z=DnhKg}DWbM6!T#L_3>{q^p{Mo$&x7CzDHq~cVZ8PAl z3X;mATu5z5#sL?Wp!{tqZgBJm%m}C>F`3dVJ5q;^JRVAVUK9y0PWoP;@ab@oXy*u# zD4&m_i@c}tp8nsnokGrI0xq~6@A-nK^K~yz#MgoexM(CF1x(Q5b3sSId(QaWPQduO zXXd}=2y+Ea54U%w&~L|c`*sj8foCUZ_ww_Fp1!``DDZ^4^k950>>}v=!xQv*Zr}cP zd@h&}^DX&vfy>VpaG|#+@@G97y*u^XjXx8(`GO0YLY>|_SJ9 z>$$H`^Sks}55KPv^!haTJbwfa2--pvjE{Y|UGG`XJ^E$}93fBt!4vU%J=Poh`g^oe y|H~Edf6jSc&s}((|BXF5|I+Tkg`I?5gg(BfXEx92F#!{FgvjH4)-y}sr2ijRLj0)! literal 0 HcmV?d00001 diff --git a/docker/start_db.sh b/docker/start_db.sh index a322aa97c..f5a946dbf 100755 --- a/docker/start_db.sh +++ b/docker/start_db.sh @@ -101,6 +101,13 @@ for a in ${COORDINATORS[*]} ; do curl -u root:test --insecure --fail "$SCHEME://$a/_api/version" done +echo "" +echo "" +echo "Copying test ML models into containers..." +for c in $(docker ps -a -f name=adb-.* -q) ; do + docker cp "$LOCATION"/foo.bin "$c":/tmp +done + echo "" echo "" echo "Done, your deployment is reachable at: " diff --git a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java index 9178657ce..a9560a0db 100644 --- a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java +++ b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java @@ -102,6 +102,8 @@ public class VPackDeserializers { return context.deserialize(vpack, SegmentationAnalyzer.class); case collation: return context.deserialize(vpack, CollationAnalyzer.class); + case classification: + return context.deserialize(vpack, ClassificationAnalyzer.class); default: throw new IllegalArgumentException("Unknown analyzer type: " + type); } diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index 39b97f360..ac26fa02f 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -994,7 +994,7 @@ void classificationAnalyzer(ArangoDatabase db) { assumeTrue(isEnterprise()); ClassificationAnalyzerProperties properties = new ClassificationAnalyzerProperties(); - properties.setModelLocation("/foo/bar"); + properties.setModelLocation("/tmp/foo.bin"); properties.setTopK(2); properties.setThreshold(.5); From 96d8b71223e277310bad9b8869b209879ef3ecbd Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Thu, 15 Sep 2022 09:58:56 +0200 Subject: [PATCH 3/5] NearestNeighborsAnalyzer --- .../entity/arangosearch/AnalyzerType.java | 3 +- .../analyzer/NearestNeighborsAnalyzer.java | 66 ++++++++++++++++++ .../NearestNeighborsAnalyzerProperties.java | 69 +++++++++++++++++++ .../velocypack/VPackDeserializers.java | 2 + .../java/com/arangodb/ArangoSearchTest.java | 23 +++++++ 5 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java index 15832c441..a6867f189 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java +++ b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java @@ -37,5 +37,6 @@ public enum AnalyzerType { geopoint, segmentation, collation, - classification + classification, + nearest_neighbors } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java new file mode 100644 index 000000000..c8641db4f --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java @@ -0,0 +1,66 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer capable of finding nearest neighbors of tokens in the input. It applies a user-provided supervised + * fastText word embedding model to retrieve nearest neighbor tokens in the text. It is able to find neighbors of + * individual tokens as well as entire input strings. For entire input strings, the Analyzer will return nearest + * neighbors for each token within the input string. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.10 + */ +public class NearestNeighborsAnalyzer extends SearchAnalyzer { + public NearestNeighborsAnalyzer() { + setType(AnalyzerType.nearest_neighbors); + } + + private NearestNeighborsAnalyzerProperties properties; + + public NearestNeighborsAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(NearestNeighborsAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + NearestNeighborsAnalyzer that = (NearestNeighborsAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java new file mode 100644 index 000000000..42335b299 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java @@ -0,0 +1,69 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.velocypack.annotations.SerializedName; + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.10 + */ +public class NearestNeighborsAnalyzerProperties { + + @SerializedName("model_location") + private String modelLocation; + + @SerializedName("top_k") + private Integer topK; + + + public String getModelLocation() { + return modelLocation; + } + + public void setModelLocation(String modelLocation) { + this.modelLocation = modelLocation; + } + + public Integer getTopK() { + return topK; + } + + public void setTopK(Integer topK) { + this.topK = topK; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NearestNeighborsAnalyzerProperties that = (NearestNeighborsAnalyzerProperties) o; + return Objects.equals(modelLocation, that.modelLocation) && Objects.equals(topK, that.topK); + } + + @Override + public int hashCode() { + return Objects.hash(modelLocation, topK); + } +} diff --git a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java index a9560a0db..4fd1dedfb 100644 --- a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java +++ b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java @@ -104,6 +104,8 @@ public class VPackDeserializers { return context.deserialize(vpack, CollationAnalyzer.class); case classification: return context.deserialize(vpack, ClassificationAnalyzer.class); + case nearest_neighbors: + return context.deserialize(vpack, NearestNeighborsAnalyzer.class); default: throw new IllegalArgumentException("Unknown analyzer type: " + type); } diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index ac26fa02f..947ae8f15 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -1011,6 +1011,29 @@ void classificationAnalyzer(ArangoDatabase db) { createGetAndDeleteTypedAnalyzer(db, analyzer); } + @ParameterizedTest(name = "{index}") + @MethodSource("dbs") + void nearestNeighborsAnalyzer(ArangoDatabase db) { + assumeTrue(isAtLeastVersion(3, 10)); + assumeTrue(isEnterprise()); + + NearestNeighborsAnalyzerProperties properties = new NearestNeighborsAnalyzerProperties(); + properties.setModelLocation("/tmp/foo.bin"); + properties.setTopK(2); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + NearestNeighborsAnalyzer analyzer = new NearestNeighborsAnalyzer(); + analyzer.setName("test-" + UUID.randomUUID()); + analyzer.setProperties(properties); + analyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(db, analyzer); + } + @ParameterizedTest(name = "{index}") @MethodSource("dbs") void offsetFeature(ArangoDatabase db) { From 722438de27de9c2f2108bcfd848ab00fd33c48f6 Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Thu, 15 Sep 2022 10:17:35 +0200 Subject: [PATCH 4/5] MinHashAnalyzer --- .../entity/arangosearch/AnalyzerType.java | 3 +- .../analyzer/MinHashAnalyzer.java | 64 +++++++++++++++++++ .../analyzer/MinHashAnalyzerProperties.java | 63 ++++++++++++++++++ .../velocypack/VPackDeserializers.java | 2 + .../java/com/arangodb/ArangoSearchTest.java | 30 +++++++++ 5 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java index a6867f189..acf38797d 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java +++ b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java @@ -38,5 +38,6 @@ public enum AnalyzerType { segmentation, collation, classification, - nearest_neighbors + nearest_neighbors, + minhash } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java new file mode 100644 index 000000000..116103e52 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java @@ -0,0 +1,64 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer that computes so called MinHash signatures using a locality-sensitive hash function. It applies an + * Analyzer of your choice before the hashing, for example, to break up text into words. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.10 + */ +public class MinHashAnalyzer extends SearchAnalyzer { + public MinHashAnalyzer() { + setType(AnalyzerType.minhash); + } + + private MinHashAnalyzerProperties properties; + + public MinHashAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(MinHashAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + MinHashAnalyzer that = (MinHashAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java new file mode 100644 index 000000000..a451c0525 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java @@ -0,0 +1,63 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.10 + */ +public class MinHashAnalyzerProperties { + + private SearchAnalyzer analyzer; + private Integer numHashes; + + public SearchAnalyzer getAnalyzer() { + return analyzer; + } + + public void setAnalyzer(SearchAnalyzer analyzer) { + this.analyzer = analyzer; + } + + public Integer getNumHashes() { + return numHashes; + } + + public void setNumHashes(Integer numHashes) { + this.numHashes = numHashes; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + MinHashAnalyzerProperties that = (MinHashAnalyzerProperties) o; + return Objects.equals(analyzer, that.analyzer) && Objects.equals(numHashes, that.numHashes); + } + + @Override + public int hashCode() { + return Objects.hash(analyzer, numHashes); + } +} diff --git a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java index 4fd1dedfb..5c72b8e0e 100644 --- a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java +++ b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java @@ -106,6 +106,8 @@ public class VPackDeserializers { return context.deserialize(vpack, ClassificationAnalyzer.class); case nearest_neighbors: return context.deserialize(vpack, NearestNeighborsAnalyzer.class); + case minhash: + return context.deserialize(vpack, MinHashAnalyzer.class); default: throw new IllegalArgumentException("Unknown analyzer type: " + type); } diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index 947ae8f15..3a3d66ce8 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -1034,6 +1034,36 @@ void nearestNeighborsAnalyzer(ArangoDatabase db) { createGetAndDeleteTypedAnalyzer(db, analyzer); } + @ParameterizedTest(name = "{index}") + @MethodSource("dbs") + void MinHashAnalyzer(ArangoDatabase db) { + assumeTrue(isAtLeastVersion(3, 10)); + assumeTrue(isEnterprise()); + + SegmentationAnalyzerProperties segProperties = new SegmentationAnalyzerProperties(); + segProperties.setBreakMode(SegmentationAnalyzerProperties.BreakMode.alpha); + segProperties.setAnalyzerCase(SearchAnalyzerCase.lower); + + SegmentationAnalyzer segAnalyzer = new SegmentationAnalyzer(); + segAnalyzer.setProperties(segProperties); + + MinHashAnalyzerProperties properties = new MinHashAnalyzerProperties(); + properties.setAnalyzer(segAnalyzer); + properties.setNumHashes(2); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + MinHashAnalyzer analyzer = new MinHashAnalyzer(); + analyzer.setName("test-" + UUID.randomUUID()); + analyzer.setProperties(properties); + analyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(db, analyzer); + } + @ParameterizedTest(name = "{index}") @MethodSource("dbs") void offsetFeature(ArangoDatabase db) { From b62b48f6250f7f946e75448465b5b0e19a38806e Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Thu, 15 Sep 2022 10:29:59 +0200 Subject: [PATCH 5/5] test fixes --- src/main/java/com/arangodb/entity/InvertedIndexField.java | 3 ++- src/test/resources/logback-test.xml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/arangodb/entity/InvertedIndexField.java b/src/main/java/com/arangodb/entity/InvertedIndexField.java index 92ee15cb1..016fb2c39 100644 --- a/src/main/java/com/arangodb/entity/InvertedIndexField.java +++ b/src/main/java/com/arangodb/entity/InvertedIndexField.java @@ -18,7 +18,7 @@ public class InvertedIndexField implements Entity { private Boolean searchField; private Boolean trackListPositions; private final Set features = new HashSet<>(); - private final Collection nested = new ArrayList<>(); + private Collection nested; public String getName() { return name; @@ -79,6 +79,7 @@ public Collection getNested() { } public InvertedIndexField nested(InvertedIndexField... nested) { + if(this.nested == null) this.nested = new ArrayList<>(); Collections.addAll(this.nested, nested); return this; } diff --git a/src/test/resources/logback-test.xml b/src/test/resources/logback-test.xml index 579f1b9db..f67855e9c 100644 --- a/src/test/resources/logback-test.xml +++ b/src/test/resources/logback-test.xml @@ -8,7 +8,7 @@ - +