Skip to content

[DE-526] Search optimisation #503

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ public final class InvertedIndexEntity {
private Collection<InvertedIndexField> fields;
private Boolean searchField;
private Collection<StoredValue> storedValues;
private Collection<String> optimizeTopK;
private InvertedIndexPrimarySort primarySort;
private String analyzer;
private Set<AnalyzerFeature> features;
Expand Down Expand Up @@ -104,6 +105,10 @@ public Collection<StoredValue> getStoredValues() {
return storedValues;
}

/**
 * @return the sort expressions stored in this entity's {@code optimizeTopK} field
 * @since ArangoDB 3.11, Enterprise Edition only
 */
public Collection<String> getOptimizeTopK() {
    return this.optimizeTopK;
}

/**
 * @return the value of this entity's {@code primarySort} field
 */
public InvertedIndexPrimarySort getPrimarySort() {
    return this.primarySort;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public final class ArangoSearchPropertiesEntity extends ViewEntity {
private Collection<CollectionLink> links;
private ArangoSearchCompression primarySortCompression;
private Collection<StoredValue> storedValues;
private Collection<String> optimizeTopK;
private Boolean primarySortCache;
private Boolean primaryKeyCache;

Expand Down Expand Up @@ -121,6 +122,14 @@ public Collection<StoredValue> getStoredValues() {
return storedValues;
}

/**
 * Accessor for the {@code optimizeTopK} property of this View.
 *
 * @return a collection of strings defining optimized sort expressions
 * @since ArangoDB 3.11, Enterprise Edition only
 */
public Collection<String> getOptimizeTopK() {
    return this.optimizeTopK;
}

/**
 * @return the value of this entity's {@code primarySortCache} field
 */
public Boolean getPrimarySortCache() {
    return this.primarySortCache;
}
Expand Down
15 changes: 15 additions & 0 deletions core/src/main/java/com/arangodb/model/InvertedIndexOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class InvertedIndexOptions extends IndexOptions<InvertedIndexOption
private Integer parallelism;
private InvertedIndexPrimarySort primarySort;
private final Collection<StoredValue> storedValues = new ArrayList<>();
private final Collection<String> optimizeTopK = new ArrayList<>();
private String analyzer;
private final Set<AnalyzerFeature> features = new HashSet<>();
private Boolean includeAllFields;
Expand Down Expand Up @@ -112,6 +113,20 @@ public InvertedIndexOptions storedValues(StoredValue... storedValues) {
return this;
}

/**
 * @return the sort expressions accumulated so far through {@code optimizeTopK(String...)};
 *         this is the options object's own backing collection, not a copy
 * @since ArangoDB 3.11, Enterprise Edition only
 */
public Collection<String> getOptimizeTopK() {
    return this.optimizeTopK;
}

/**
 * Appends the given sort expressions to the ones already registered on these options.
 * Repeated calls accumulate; earlier expressions are kept.
 *
 * @param optimizeTopK An array of strings defining sort expressions that you want to optimize.
 * @return options
 * @since ArangoDB 3.11, Enterprise Edition only
 */
public InvertedIndexOptions optimizeTopK(String... optimizeTopK) {
    // Add one by one, in argument order, to the existing backing collection.
    for (String expression : optimizeTopK) {
        this.optimizeTopK.add(expression);
    }
    return this;
}

/**
 * @return the value of the {@code analyzer} field of these options
 */
public String getAnalyzer() {
    return this.analyzer;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public final class ArangoSearchCreateOptions {
private Collection<PrimarySort> primarySorts;
private ArangoSearchCompression primarySortCompression;
private Collection<StoredValue> storedValues;
private Collection<String> optimizeTopK;
private Boolean primarySortCache;
private Boolean primaryKeyCache;

Expand All @@ -57,14 +58,11 @@ ArangoSearchCreateOptions name(final String name) {
}

/**
* @param consolidationIntervalMsec Wait at least this many milliseconds between committing index data changes
* and making them visible to
* queries (default: 60000, to disable use: 0). For the case where there are a
* lot of inserts/updates, a
* lower value, until commit, will cause the index not to account for them and
* memory usage would
* continue to grow. For the case where there are a few inserts/updates, a
* higher value will impact
* @param consolidationIntervalMsec Wait at least this many milliseconds between committing index data changes and
* making them visible to queries (default: 60000, to disable use: 0). For the case
* where there are a lot of inserts/updates, a lower value, until commit, will
* cause the index not to account for them and memory usage would continue to grow.
* For the case where there are a few inserts/updates, a higher value will impact
* performance and waste disk space for each commit call without any added
* benefits.
* @return options
Expand All @@ -76,26 +74,19 @@ public ArangoSearchCreateOptions consolidationIntervalMsec(final Long consolidat

/**
* @param commitIntervalMsec Wait at least this many milliseconds between committing view data store changes and
* making documents visible to
* queries (default: 1000, to disable use: 0). For the case where there are a lot of
* inserts/updates, a lower value,
* until commit, will cause the index not to account for them and memory usage would
* continue to grow. For the case
* where there are a few inserts/updates, a higher value will impact performance and
* waste disk space for each
* commit call without any added benefits. Background: For data retrieval ArangoSearch
* views follow the concept of
* “eventually-consistent”, i.e. eventually all the data in ArangoDB will be matched by
* corresponding query
* expressions. The concept of ArangoSearch view “commit” operation is introduced to
* control the upper-bound on the
* time until document addition/removals are actually reflected by corresponding query
* expressions. Once a “commit”
* operation is complete all documents added/removed prior to the start of the “commit”
* operation will be reflected
* by queries invoked in subsequent ArangoDB transactions, in-progress ArangoDB
* transactions will still continue to
* return a repeatable-read state.
* making documents visible to queries (default: 1000, to disable use: 0). For the case
* where there are a lot of inserts/updates, a lower value, until commit, will cause the
* index not to account for them and memory usage would continue to grow. For the case
* where there are a few inserts/updates, a higher value will impact performance and waste
* disk space for each commit call without any added benefits. Background: For data
* retrieval ArangoSearch views follow the concept of “eventually-consistent”, i.e.
* eventually all the data in ArangoDB will be matched by corresponding query expressions.
* The concept of ArangoSearch view “commit” operation is introduced to control the
* upper-bound on the time until document addition/removals are actually reflected by
* corresponding query expressions. Once a “commit” operation is complete all documents
* added/removed prior to the start of the “commit” operation will be reflected by queries
* invoked in subsequent ArangoDB transactions, in-progress ArangoDB transactions will
* still continue to return a repeatable-read state.
* @return options
*/
public ArangoSearchCreateOptions commitIntervalMsec(final Long commitIntervalMsec) {
Expand All @@ -105,14 +96,11 @@ public ArangoSearchCreateOptions commitIntervalMsec(final Long commitIntervalMse

/**
* @param cleanupIntervalStep Wait at least this many commits between removing unused files in data directory
* (default: 10, to
* disable use: 0). For the case where the consolidation policies merge segments often
* (i.e. a lot of
* commit+consolidate), a lower value will cause a lot of disk space to be wasted. For
* the case where the
* consolidation policies rarely merge segments (i.e. few inserts/deletes), a higher
* value will impact
* performance without any added benefits.
* (default: 10, to disable use: 0). For the case where the consolidation policies merge
* segments often (i.e. a lot of commit+consolidate), a lower value will cause a lot of
* disk space to be wasted. For the case where the consolidation policies rarely merge
* segments (i.e. few inserts/deletes), a higher value will impact performance without
* any added benefits.
* @return options
*/
public ArangoSearchCreateOptions cleanupIntervalStep(final Long cleanupIntervalStep) {
Expand Down Expand Up @@ -164,6 +152,16 @@ public ArangoSearchCreateOptions storedValues(final StoredValue... storedValues)
return this;
}

/**
 * Sets the sort expressions to optimize, replacing any previously set value.
 *
 * @param optimizeTopK An array of strings defining sort expressions that you want to optimize.
 * @return options
 * @since ArangoDB 3.11, Enterprise Edition only
 */
public ArangoSearchCreateOptions optimizeTopK(final String... optimizeTopK) {
    // Arrays.asList yields a fixed-size list view over the supplied array.
    final Collection<String> expressions = Arrays.asList(optimizeTopK);
    this.optimizeTopK = expressions;
    return this;
}

/**
* @param primarySortCache If you enable this option, then the primary sort columns are always cached in memory.
* This can improve the performance of queries that utilize the primary sort order.
Expand Down Expand Up @@ -231,6 +229,10 @@ public Collection<StoredValue> getStoredValues() {
return storedValues;
}

/**
 * @return the sort expressions currently held in the {@code optimizeTopK} field of these options
 * @since ArangoDB 3.11, Enterprise Edition only
 */
public Collection<String> getOptimizeTopK() {
    return this.optimizeTopK;
}

/**
 * @return the value of the {@code primarySortCache} field of these options
 */
public Boolean getPrimarySortCache() {
    return this.primarySortCache;
}
Expand Down
10 changes: 8 additions & 2 deletions driver/src/test/java/com/arangodb/ArangoSearchTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import com.arangodb.entity.arangosearch.analyzer.*;
import com.arangodb.model.InvertedIndexOptions;
import com.arangodb.model.arangosearch.*;
import com.arangodb.util.TestUtils;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
Expand Down Expand Up @@ -424,7 +423,7 @@ private void createGetAndDeleteTypedAnalyzer(ArangoDatabase db, SearchAnalyzer a
// getAnalyzers
SearchAnalyzer foundAnalyzer =
db.getSearchAnalyzers().stream().filter(it -> it.getName().equals(fullyQualifiedName))
.findFirst().get();
.findFirst().get();
assertThat(foundAnalyzer).isEqualTo(analyzer);

// deleteAnalyzer
Expand Down Expand Up @@ -672,6 +671,8 @@ void arangoSearchOptions(ArangoDatabase db) {
.primaryKeyCache(true);
StoredValue storedValue = new StoredValue(Arrays.asList("a", "b"), ArangoSearchCompression.none, true);
options.storedValues(storedValue);
String[] optimizeTopK = new String[]{"BM25(@doc) DESC", "TFIDF(@doc) DESC"};
options.optimizeTopK(optimizeTopK);

final ArangoSearch view = db.arangoSearch(viewName);
view.create(options);
Expand Down Expand Up @@ -713,6 +714,11 @@ void arangoSearchOptions(ArangoDatabase db) {
FieldLink nested = fieldLink.getNested().iterator().next();
assertThat(nested.getName()).isEqualTo("f2");
}

if (isEnterprise() && isAtLeastVersion(3, 11)) {
assertThat(properties.getOptimizeTopK()).containsExactly(optimizeTopK);
}

}

@ParameterizedTest(name = "{index}")
Expand Down
5 changes: 5 additions & 0 deletions driver/src/test/java/com/arangodb/InvertedIndexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ private InvertedIndexOptions createOptions(String analyzerName) {
.cache(cache)
)
.storedValues(new StoredValue(Arrays.asList("f3", "f4"), ArangoSearchCompression.none, cache))
.optimizeTopK("BM25(@doc) DESC", "TFIDF(@doc) DESC")
.analyzer(analyzerName)
.features(AnalyzerFeature.position, AnalyzerFeature.frequency)
.includeAllFields(false)
Expand Down Expand Up @@ -144,6 +145,10 @@ private void assertCorrectIndexEntity(InvertedIndexEntity indexResult, InvertedI
assertThat(indexResult.getWritebufferSizeMax()).isEqualTo(options.getWritebufferSizeMax());
assertThat(indexResult.getCache()).isEqualTo(options.getCache());
assertThat(indexResult.getPrimaryKeyCache()).isEqualTo(options.getPrimaryKeyCache());

if (isEnterprise() && isAtLeastVersion(3, 11)) {
assertThat(indexResult.getOptimizeTopK()).containsExactlyElementsOf(options.getOptimizeTopK());
}
}

@ParameterizedTest(name = "{index}")
Expand Down