-
-
Notifications
You must be signed in to change notification settings - Fork 391
Speed up fuzzy search #2639
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up fuzzy search #2639
Changes from all commits
1a3cb84
9436cdf
b71acb4
6e84f7b
01ba2c9
175d1b6
ae8d61f
c00f7e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,96 +1,91 @@ | ||
-- | Parallel versions of 'filter' and 'simpleFilter' | ||
|
||
module Text.Fuzzy.Parallel | ||
( filter, | ||
simpleFilter, | ||
Scored(..), | ||
-- reexports | ||
Fuzzy, | ||
match, | ||
Scored(..) | ||
) where | ||
|
||
import Control.Monad.ST (runST) | ||
import Control.Parallel.Strategies (Eval, Strategy, evalTraversable, | ||
parTraversable, rseq, using) | ||
import Data.Monoid.Textual (TextualMonoid) | ||
import Data.Vector (Vector, (!)) | ||
import qualified Data.Vector as V | ||
-- need to use a stable sort | ||
import Data.Bifunctor (second) | ||
import Data.Char (toLower) | ||
import Data.Maybe (fromMaybe) | ||
import qualified Data.Monoid.Textual as T | ||
import Control.Parallel.Strategies (rseq, using, parList, evalList) | ||
import Data.Bits ((.|.)) | ||
import Data.Maybe (fromMaybe, mapMaybe) | ||
import qualified Data.Text as T | ||
import qualified Data.Text.Internal as T | ||
import qualified Data.Text.Array as TA | ||
import Prelude hiding (filter) | ||
import Text.Fuzzy (Fuzzy (..)) | ||
|
||
data Scored a = Scored {score_ :: !Int, original:: !a} | ||
deriving (Functor,Show) | ||
data Scored a = Scored {score :: !Int, original:: !a} | ||
deriving (Functor, Show) | ||
|
||
-- | Returns the rendered output and the | ||
-- matching score for a pattern and a text. | ||
-- Two examples are given below: | ||
-- | ||
-- >>> match "fnt" "infinite" "" "" id True | ||
-- Just ("infinite",3) | ||
-- >>> match "fnt" "infinite" | ||
-- Just 3 | ||
-- | ||
-- >>> match "hsk" ("Haskell",1995) "<" ">" fst False | ||
-- Just ("<h>a<s><k>ell",5) | ||
-- >>> match "hsk" "Haskell" | ||
-- Just 5 | ||
-- | ||
{-# INLINABLE match #-} | ||
|
||
match :: (T.TextualMonoid s) | ||
=> s -- ^ Pattern in lowercase except for first character | ||
-> t -- ^ The value containing the text to search in. | ||
-> s -- ^ The text to add before each match. | ||
-> s -- ^ The text to add after each match. | ||
-> (t -> s) -- ^ The function to extract the text from the container. | ||
-> Maybe (Fuzzy t s) -- ^ The original value, rendered string and score. | ||
match pattern t pre post extract = | ||
if null pat then Just (Fuzzy t result totalScore) else Nothing | ||
match :: T.Text -- ^ Pattern in lowercase except for first character | ||
-> T.Text -- ^ The text to search in. | ||
-> Maybe Int -- ^ The score | ||
match (T.Text pArr pOff pLen) (T.Text sArr sOff sLen) = go 0 1 pOff sOff | ||
where | ||
null :: (T.TextualMonoid s) => s -> Bool | ||
null = not . T.any (const True) | ||
|
||
s = extract t | ||
(totalScore, _currScore, result, pat, _) = | ||
T.foldl' | ||
undefined | ||
(\(tot, cur, res, pat, isFirst) c -> | ||
case T.splitCharacterPrefix pat of | ||
Nothing -> (tot, 0, res <> T.singleton c, pat, isFirst) | ||
Just (x, xs) -> | ||
-- the case of the first character has to match | ||
-- otherwise use lower case since the pattern is assumed lower | ||
let !c' = if isFirst then c else toLower c in | ||
if x == c' then | ||
let cur' = cur * 2 + 1 in | ||
(tot + cur', cur', res <> pre <> T.singleton c <> post, xs, False) | ||
else (tot, 0, res <> T.singleton c, pat, isFirst) | ||
) ( 0 | ||
, 1 -- matching at the start gives a bonus (cur = 1) | ||
, mempty, pattern, True) s | ||
pTotal = pOff + pLen | ||
sDelta = sOff + sLen - pTotal | ||
|
||
go !totalScore !currScore !currPOff !currSOff | ||
-- If pattern has been matched in full | ||
| currPOff >= pTotal | ||
= Just totalScore | ||
-- If there is not enough left to match the rest of the pattern, equivalent to | ||
-- (sOff + sLen - currSOff) < (pOff + pLen - currPOff) | ||
| currSOff > currPOff + sDelta | ||
= Nothing | ||
-- This is slightly broken for non-ASCII: | ||
-- 1. If code units, consisting a single pattern code point, are found as parts | ||
-- of different code points, it counts as a match. Unless you use a ton of emojis | ||
-- as identifiers, such false positives should not be be a big deal, | ||
-- and anyways HLS does not currently support such use cases, because it uses | ||
-- code point and UTF-16 code unit positions interchangeably. | ||
-- 2. Case conversions is not applied to non-ASCII code points, because one has | ||
-- to call T.toLower (not T.map toLower), reallocating the string in full, which | ||
-- is too much of performance penalty for fuzzy search. Again, anyway HLS does not | ||
-- attempt to do justice to Unicode: proper Unicode text matching requires | ||
-- `unicode-transforms` and friends. | ||
-- Altogether we sacrifice correctness for the sake of performance, which | ||
-- is a right trade-off for fuzzy search. | ||
| pByte <- TA.unsafeIndex pArr currPOff | ||
, sByte <- TA.unsafeIndex sArr currSOff | ||
-- First byte (currPOff == pOff) should match exactly, otherwise - up to case. | ||
, pByte == sByte || (currPOff /= pOff && pByte == toLowerAscii sByte) | ||
= let curr = currScore * 2 + 1 in | ||
go (totalScore + curr) curr (currPOff + 1) (currSOff + 1) | ||
| otherwise | ||
Comment on lines
+41
to
+68
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How do we know that the new implementation produces the same scores? I would like to see a unit test here, or a property test. It's ok for the test suite to depend on the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, even the old implementation was not fully matching There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The ghcide test suite. |
||
= go totalScore 0 currPOff (currSOff + 1) | ||
|
||
toLowerAscii w = if (w - 65) < 26 then w .|. 0x20 else w | ||
|
||
-- | The function to filter a list of values by fuzzy search on the text extracted from them. | ||
filter :: (TextualMonoid s) | ||
=> Int -- ^ Chunk size. 1000 works well. | ||
-> Int -- ^ Max. number of results wanted | ||
-> s -- ^ Pattern. | ||
-> [t] -- ^ The list of values containing the text to search in. | ||
-> s -- ^ The text to add before each match. | ||
-> s -- ^ The text to add after each match. | ||
-> (t -> s) -- ^ The function to extract the text from the container. | ||
-> [Scored t] -- ^ The list of results, sorted, highest score first. | ||
filter chunkSize maxRes pattern ts pre post extract = runST $ do | ||
let v = V.mapMaybe id | ||
(V.map (\t -> match pattern' t pre post extract) (V.fromList ts) | ||
`using` | ||
parVectorChunk chunkSize (evalTraversable forceScore)) | ||
perfectScore = score $ fromMaybe (error $ T.toString undefined pattern) $ | ||
match pattern' pattern' "" "" id | ||
return $ partialSortByAscScore maxRes perfectScore v | ||
filter :: Int -- ^ Chunk size. 1000 works well. | ||
-> Int -- ^ Max. number of results wanted | ||
-> T.Text -- ^ Pattern. | ||
-> [t] -- ^ The list of values containing the text to search in. | ||
-> (t -> T.Text) -- ^ The function to extract the text from the container. | ||
-> [Scored t] -- ^ The list of results, sorted, highest score first. | ||
filter chunkSize maxRes pattern ts extract = partialSortByAscScore maxRes perfectScore (concat vss) | ||
where | ||
-- Preserve case for the first character, make all others lowercase | ||
pattern' = case T.splitCharacterPrefix pattern of | ||
Just (c, rest) -> T.singleton c <> T.map toLower rest | ||
_ -> pattern | ||
pattern' = case T.uncons pattern of | ||
Just (c, rest) -> T.cons c (T.toLower rest) | ||
_ -> pattern | ||
vss = map (mapMaybe (\t -> flip Scored t <$> match pattern' (extract t))) (chunkList chunkSize ts) | ||
`using` parList (evalList rseq) | ||
pepeiborra marked this conversation as resolved.
Show resolved
Hide resolved
|
||
perfectScore = fromMaybe (error $ T.unpack pattern) $ match pattern' pattern' | ||
|
||
-- | Return all elements of the list that have a fuzzy | ||
-- match against the pattern. Runs with default settings where | ||
|
@@ -99,84 +94,44 @@ filter chunkSize maxRes pattern ts pre post extract = runST $ do | |
-- >>> simpleFilter "vm" ["vim", "emacs", "virtual machine"] | ||
-- ["vim","virtual machine"] | ||
{-# INLINABLE simpleFilter #-} | ||
simpleFilter :: (TextualMonoid s) | ||
=> Int -- ^ Chunk size. 1000 works well. | ||
-> Int -- ^ Max. number of results wanted | ||
-> s -- ^ Pattern to look for. | ||
-> [s] -- ^ List of texts to check. | ||
-> [Scored s] -- ^ The ones that match. | ||
simpleFilter :: Int -- ^ Chunk size. 1000 works well. | ||
-> Int -- ^ Max. number of results wanted | ||
-> T.Text -- ^ Pattern to look for. | ||
-> [T.Text] -- ^ List of texts to check. | ||
-> [Scored T.Text] -- ^ The ones that match. | ||
simpleFilter chunk maxRes pattern xs = | ||
filter chunk maxRes pattern xs mempty mempty id | ||
|
||
-------------------------------------------------------------------------------- | ||
|
||
-- | Evaluation that forces the 'score' field | ||
forceScore :: TextualMonoid s => Fuzzy t s -> Eval(Fuzzy t s) | ||
forceScore it@Fuzzy{score} = do | ||
score' <- rseq score | ||
return it{score = score'} | ||
filter chunk maxRes pattern xs id | ||
|
||
-------------------------------------------------------------------------------- | ||
|
||
-- | Divides a vector in chunks, applies the strategy in parallel to each chunk. | ||
parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a) | ||
parVectorChunk chunkSize st v = | ||
V.concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v) | ||
|
||
-- >>> chunkVector 3 (V.fromList [0..10]) | ||
-- >>> chunkVector 3 (V.fromList [0..11]) | ||
-- >>> chunkVector 3 (V.fromList [0..12]) | ||
-- [[0,1,2],[3,4,5],[6,7,8],[9,10]] | ||
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]] | ||
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]] | ||
chunkVector :: Int -> Vector a -> [Vector a] | ||
chunkVector chunkSize v = do | ||
let indices = chunkIndices chunkSize (0,V.length v) | ||
[V.slice l (h-l+1) v | (l,h) <- indices] | ||
|
||
-- >>> chunkIndices 3 (0,9) | ||
-- >>> chunkIndices 3 (0,10) | ||
-- >>> chunkIndices 3 (0,11) | ||
-- [(0,2),(3,5),(6,8)] | ||
-- [(0,2),(3,5),(6,8),(9,9)] | ||
-- [(0,2),(3,5),(6,8),(9,10)] | ||
chunkIndices :: Int -> (Int,Int) -> [(Int,Int)] | ||
chunkIndices chunkSize (from,to) = | ||
map (second pred) $ | ||
pairwise $ | ||
[from, from+chunkSize .. to-1] ++ [to] | ||
|
||
pairwise :: [a] -> [(a,a)] | ||
pairwise [] = [] | ||
pairwise [_] = [] | ||
pairwise (x:y:xs) = (x,y) : pairwise (y:xs) | ||
chunkList :: Int -> [a] -> [[a]] | ||
chunkList chunkSize = go | ||
where | ||
go [] = [] | ||
go xs = ys : go zs | ||
where | ||
(ys, zs) = splitAt chunkSize xs | ||
|
||
-- | A stable partial sort ascending by score. O(N) best case, O(wanted*N) worst case | ||
partialSortByAscScore :: TextualMonoid s | ||
=> Int -- ^ Number of items needed | ||
partialSortByAscScore | ||
:: Int -- ^ Number of items needed | ||
-> Int -- ^ Value of a perfect score | ||
-> Vector (Fuzzy t s) | ||
-> [Scored t] | ||
partialSortByAscScore wantedCount perfectScore v = loop 0 (SortState minBound perfectScore 0) [] where | ||
l = V.length v | ||
loop index st@SortState{..} acc | ||
-> [Scored t] | ||
pepeiborra marked this conversation as resolved.
Show resolved
Hide resolved
|
||
partialSortByAscScore wantedCount perfectScore orig = loop orig (SortState minBound perfectScore 0) [] where | ||
loop [] st@SortState{..} acc | ||
| foundCount == wantedCount = reverse acc | ||
| index == l | ||
-- ProgressCancelledException | ||
= if bestScoreSeen < scoreWanted | ||
then loop 0 st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc | ||
| otherwise = if bestScoreSeen < scoreWanted | ||
then loop orig st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc | ||
else reverse acc | ||
| otherwise = | ||
case v!index of | ||
x | score x == scoreWanted | ||
-> loop (index+1) st{foundCount = foundCount+1} (toScored x:acc) | ||
| score x < scoreWanted && score x > bestScoreSeen | ||
-> loop (index+1) st{bestScoreSeen = score x} acc | ||
| otherwise | ||
-> loop (index+1) st acc | ||
|
||
toScored :: TextualMonoid s => Fuzzy t s -> Scored t | ||
toScored Fuzzy{..} = Scored score original | ||
loop (x : xs) st@SortState{..} acc | ||
| foundCount == wantedCount = reverse acc | ||
| score x == scoreWanted | ||
= loop xs st{foundCount = foundCount+1} (x:acc) | ||
| score x < scoreWanted && score x > bestScoreSeen | ||
= loop xs st{bestScoreSeen = score x} acc | ||
| otherwise | ||
= loop xs st acc | ||
|
||
data SortState a = SortState | ||
{ bestScoreSeen :: !Int | ||
|
Uh oh!
There was an error while loading. Please reload this page.