From b6f9d6f2c56074a3f46dbd8635560af5e2f4d420 Mon Sep 17 00:00:00 2001 From: "simonpj@microsoft.com" Date: Wed, 22 Dec 2010 17:51:24 +0000 Subject: [PATCH] Add fuzzyLookup, a variant of fuzzyMatch Plus, I changed quite a bit of layout to make the lines shorter. --- compiler/utils/Util.lhs | 109 +++++++++++++++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 36 deletions(-) diff --git a/compiler/utils/Util.lhs b/compiler/utils/Util.lhs index 01a293d..b08f6fa 100644 --- a/compiler/utils/Util.lhs +++ b/compiler/utils/Util.lhs @@ -49,7 +49,7 @@ module Util ( removeSpaces, -- * Edit distance - fuzzyMatch, + fuzzyMatch, fuzzyLookup, -- * Transitive closures transitiveClosure, @@ -690,40 +690,61 @@ removeSpaces = reverse . dropWhile isSpace . reverse . dropWhile isSpace %************************************************************************ \begin{code} --- | Find the "restricted" Damerau-Levenshtein edit distance between two strings. See: . --- Based on the algorithm presented in "A Bit-Vector Algorithm for Computing Levenshtein and Damerau Edit Distances" in PSC'02 (Heikki Hyyro). --- See http://www.cs.uta.fi/~helmu/pubs/psc02.pdf and http://www.cs.uta.fi/~helmu/pubs/PSCerr.html for an explanation +-- | Find the "restricted" Damerau-Levenshtein edit distance between two strings. +-- See: . +-- Based on the algorithm presented in "A Bit-Vector Algorithm for Computing +-- Levenshtein and Damerau Edit Distances" in PSC'02 (Heikki Hyyro). +-- See http://www.cs.uta.fi/~helmu/pubs/psc02.pdf and +-- http://www.cs.uta.fi/~helmu/pubs/PSCerr.html for an explanation restrictedDamerauLevenshteinDistance :: String -> String -> Int -restrictedDamerauLevenshteinDistance str1 str2 = restrictedDamerauLevenshteinDistanceWithLengths m n str1 str2 +restrictedDamerauLevenshteinDistance str1 str2 + = restrictedDamerauLevenshteinDistanceWithLengths m n str1 str2 where m = length str1 n = length str2 -restrictedDamerauLevenshteinDistanceWithLengths :: Int -> Int -> String -> String -> Int +restrictedDamerauLevenshteinDistanceWithLengths + :: Int -> Int -> String -> String -> Int restrictedDamerauLevenshteinDistanceWithLengths m n str1 str2 - | m <= n = if n <= 32 -- n must be larger so this check is sufficient - then restrictedDamerauLevenshteinDistance' (undefined :: Word32) m n str1 str2 - else restrictedDamerauLevenshteinDistance' (undefined :: Integer) m n str1 str2 - | otherwise = if m <= 32 -- m must be larger so this check is sufficient - then restrictedDamerauLevenshteinDistance' (undefined :: Word32) n m str2 str1 - else restrictedDamerauLevenshteinDistance' (undefined :: Integer) n m str2 str1 - -restrictedDamerauLevenshteinDistance' :: (Bits bv) => bv -> Int -> Int -> String -> String -> Int + | m <= n + = if n <= 32 -- n must be larger so this check is sufficient + then restrictedDamerauLevenshteinDistance' (undefined :: Word32) m n str1 str2 + else restrictedDamerauLevenshteinDistance' (undefined :: Integer) m n str1 str2 + + | otherwise + = if m <= 32 -- m must be larger so this check is sufficient + then restrictedDamerauLevenshteinDistance' (undefined :: Word32) n m str2 str1 + else restrictedDamerauLevenshteinDistance' (undefined :: Integer) n m str2 str1 + +restrictedDamerauLevenshteinDistance' + :: (Bits bv) => bv -> Int -> Int -> String -> String -> Int restrictedDamerauLevenshteinDistance' _bv_dummy m n str1 str2 | [] <- str1 = n - | otherwise = extractAnswer $ foldl' (restrictedDamerauLevenshteinDistanceWorker (matchVectors str1) top_bit_mask vector_mask) (0, 0, m_ones, 0, m) str2 - where m_ones@vector_mask = (2 ^ m) - 1 - top_bit_mask = (1 `shiftL` (m - 1)) `asTypeOf` _bv_dummy - extractAnswer (_, _, _, _, distance) = distance - -restrictedDamerauLevenshteinDistanceWorker :: (Bits bv) => IM.IntMap bv -> bv -> bv -> (bv, bv, bv, bv, Int) -> Char -> (bv, bv, bv, bv, Int) -restrictedDamerauLevenshteinDistanceWorker str1_mvs top_bit_mask vector_mask (pm, d0, vp, vn, distance) char2 - = seq str1_mvs $ seq top_bit_mask $ seq vector_mask $ seq pm' $ seq d0' $ seq vp' $ seq vn' $ seq distance'' $ seq char2 $ (pm', d0', vp', vn', distance'') + | otherwise = extractAnswer $ + foldl' (restrictedDamerauLevenshteinDistanceWorker + (matchVectors str1) top_bit_mask vector_mask) + (0, 0, m_ones, 0, m) str2 + where + m_ones@vector_mask = (2 ^ m) - 1 + top_bit_mask = (1 `shiftL` (m - 1)) `asTypeOf` _bv_dummy + extractAnswer (_, _, _, _, distance) = distance + +restrictedDamerauLevenshteinDistanceWorker + :: (Bits bv) => IM.IntMap bv -> bv -> bv + -> (bv, bv, bv, bv, Int) -> Char -> (bv, bv, bv, bv, Int) +restrictedDamerauLevenshteinDistanceWorker str1_mvs top_bit_mask vector_mask + (pm, d0, vp, vn, distance) char2 + = seq str1_mvs $ seq top_bit_mask $ seq vector_mask $ + seq pm' $ seq d0' $ seq vp' $ seq vn' $ + seq distance'' $ seq char2 $ + (pm', d0', vp', vn', distance'') where pm' = IM.findWithDefault 0 (ord char2) str1_mvs - d0' = ((((sizedComplement vector_mask d0) .&. pm') `shiftL` 1) .&. pm) -- No need to mask the shiftL because of the restricted range of pm + d0' = ((((sizedComplement vector_mask d0) .&. pm') `shiftL` 1) .&. pm) .|. ((((pm' .&. vp) + vp) .&. vector_mask) `xor` vp) .|. pm' .|. vn + -- No need to mask the shiftL because of the restricted range of pm + hp' = vn .|. sizedComplement vector_mask (d0' .|. vp) hn' = d0' .&. vp @@ -746,11 +767,19 @@ matchVectors = snd . foldl' go (0 :: Int, IM.empty) in seq ix' $ seq im' $ (ix', im') #ifdef __GLASGOW_HASKELL__ -{-# SPECIALIZE INLINE restrictedDamerauLevenshteinDistance' :: Word32 -> Int -> Int -> String -> String -> Int #-} -{-# SPECIALIZE INLINE restrictedDamerauLevenshteinDistance' :: Integer -> Int -> Int -> String -> String -> Int #-} - -{-# SPECIALIZE restrictedDamerauLevenshteinDistanceWorker :: IM.IntMap Word32 -> Word32 -> Word32 -> (Word32, Word32, Word32, Word32, Int) -> Char -> (Word32, Word32, Word32, Word32, Int) #-} -{-# SPECIALIZE restrictedDamerauLevenshteinDistanceWorker :: IM.IntMap Integer -> Integer -> Integer -> (Integer, Integer, Integer, Integer, Int) -> Char -> (Integer, Integer, Integer, Integer, Int) #-} +{-# SPECIALIZE INLINE restrictedDamerauLevenshteinDistance' + :: Word32 -> Int -> Int -> String -> String -> Int #-} +{-# SPECIALIZE INLINE restrictedDamerauLevenshteinDistance' + :: Integer -> Int -> Int -> String -> String -> Int #-} + +{-# SPECIALIZE restrictedDamerauLevenshteinDistanceWorker + :: IM.IntMap Word32 -> Word32 -> Word32 + -> (Word32, Word32, Word32, Word32, Int) + -> Char -> (Word32, Word32, Word32, Word32, Int) #-} +{-# SPECIALIZE restrictedDamerauLevenshteinDistanceWorker + :: IM.IntMap Integer -> Integer -> Integer + -> (Integer, Integer, Integer, Integer, Int) + -> Char -> (Integer, Integer, Integer, Integer, Int) #-} {-# SPECIALIZE INLINE sizedComplement :: Word32 -> Word32 -> Word32 #-} {-# SPECIALIZE INLINE sizedComplement :: Integer -> Integer -> Integer #-} @@ -759,15 +788,23 @@ matchVectors = snd . foldl' go (0 :: Int, IM.empty) {-# SPECIALIZE matchVectors :: String -> IM.IntMap Integer #-} #endif --- | Search for possible matches to the users input in the given list, returning a small number of ranked results fuzzyMatch :: String -> [String] -> [String] -fuzzyMatch user_entered possibilites = map fst $ take mAX_RESULTS $ sortBy (comparing snd) - [ (poss, distance) | poss <- possibilites - , let distance = restrictedDamerauLevenshteinDistance poss user_entered - , distance <= fuzzy_threshold ] - where -- Work out an approriate match threshold (about a quarter of the # of characters the user entered) - fuzzy_threshold = max (round $ fromInteger (genericLength user_entered) / (4 :: Rational)) 1 - mAX_RESULTS = 3 +fuzzyMatch key vals = fuzzyLookup key [(v,v) | v <- vals] + +-- | Search for possible matches to the users input in the given list, +-- returning a small number of ranked results +fuzzyLookup :: String -> [(String,a)] -> [a] +fuzzyLookup user_entered possibilites + = map fst $ take mAX_RESULTS $ sortBy (comparing snd) + [ (poss_val, distance) | (poss_str, poss_val) <- possibilites + , let distance = restrictedDamerauLevenshteinDistance + poss_str user_entered + , distance <= fuzzy_threshold ] + where + -- Work out an approriate match threshold + -- (about a quarter of the # of characters the user entered) + fuzzy_threshold = max (round $ fromInteger (genericLength user_entered) / (4 :: Rational)) 1 + mAX_RESULTS = 3 \end{code} %************************************************************************ -- 1.7.10.4