1 {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} 2 3 -- | 4 -- Module : Data.Text.Lazy.Search 5 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 6 -- 7 -- License : BSD-style 8 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 -- duncan@haskell.org 10 -- Stability : experimental 11 -- Portability : GHC 12 -- 13 -- Fast substring search for lazy 'Text', based on work by Boyer, 14 -- Moore, Horspool, Sunday, and Lundh. Adapted from the strict 15 -- implementation. 16 17 module Data.Text.Lazy.Search 18 ( 19 indices 20 ) where 21 22 import qualified Data.Text.Array as A 23 import Data.Int (Int64) 24 import Data.Word (Word16, Word64) 25 import qualified Data.Text.Internal as T 26 import Data.Text.Fusion.Internal (PairS(..)) 27 import Data.Text.Lazy.Internal (Text(..), foldlChunks) 28 import Data.Bits ((.|.), (.&.)) 29 import Data.Text.UnsafeShift (shiftL) 30 31 -- | /O(n+m)/ Find the offsets of all non-overlapping indices of 32 -- @needle@ within @haystack@. 33 -- 34 -- This function is strict in @needle@, and lazy (as far as possible) 35 -- in the chunks of @haystack@. 36 -- 37 -- In (unlikely) bad cases, this algorithm's complexity degrades 38 -- towards /O(n*m)/. 39 indices :: Text -- ^ Substring to search for (@needle@) 40 -> Text -- ^ Text to search in (@haystack@) 41 -> [Int64] 42 -- entered 1817 timesindices needle@(Chunk n ns) _haystack@(Chunk k ks) 43 | nlen <= 0 = [] 44 | nlen == 1 = indicesOne (nindex 0) 0 k ks 45 | otherwise = scan 0 0 k ks 46 where 47 scan !g !i x@(T.Text _ _ l) xs 48 | i >= m = case xs of 49 Empty -> [] 50 Chunk y ys -> scan g (i-m) y ys 51 | lackingHay (i + nlen) x xs = [] 52 | c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen) x xs 53 | otherwise = scan (g+delta) (i+delta) x xs 54 where 55 m = fromIntegral l 56 c = hindex (i + nlast) 57 delta | nextInPattern = nlen + 1 58 | c == z = skip + 1 59 | otherwise = 1 60 nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0 61 candidateMatch !j 62 | j >= nlast = True 63 | hindex (i+j) /= nindex j = False 64 | otherwise = candidateMatch (j+1) 65 hindex = index x xs 66 nlen = wordLength needle 67 nlast = nlen - 1 68 nindex = index n ns 69 z = foldlChunks fin 0 needle 70 where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1) 71 (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2) 72 swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f) 73 buildTable (T.Text xarr xoff xlen) xs = go 74 where 75 go !(g::Int64) !i !msk !skp 76 | i >= xlast = case xs of 77 Empty -> (msk .|. swizzle z) :*: skp 78 Chunk y ys -> buildTable y ys g 0 msk skp 79 | otherwise = go (g+1) (i+1) (msk .|. swizzle c) skp' 80 where c = A.unsafeIndex xarr (xoff+i) 81 skp' | c == z = nlen - fromIntegral g - 2 82 | otherwise = skp 83 xlast = xlen - 1 84 -- | Check whether an attempt to index into the haystack at the 85 -- given offset would fail. 86 lackingHay q = go 0 87 where 88 go p (T.Text _ _ l) ps = p' < q && case ps of 89 Empty -> True 90 Chunk r rs -> go p' r rs 91 where p' = p + fromIntegral l 92 indices _ _ = [] 93 94 -- | Fast index into a partly unpacked 'Text'. We take into account 95 -- the possibility that the caller might try to access one element 96 -- past the end. 97 index :: T.Text -> Text -> Int64 -> Word16 98 -- entered 423,667 timesindex (T.Text arr off len) xs !i 99 | j < len = A.unsafeIndex arr (off+j) 100 | otherwise = case xs of 101 Empty 102 -- out of bounds, but legal 103 | j == len -> 0 104 -- should never happen, due to lackingHay above 105 | otherwise -> emptyError "index" 106 Chunk c cs -> index c cs (i-fromIntegral len) 107 where j = fromIntegral i 108 109 -- | A variant of 'indices' that scans linearly for a single 'Word16'. 110 indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64] 111 -- entered 21 timesindicesOne c = chunk 112 where 113 chunk !i (T.Text oarr ooff olen) os = go 0 114 where 115 go h | h >= olen = case os of 116 Empty -> [] 117 Chunk y ys -> chunk (i+fromIntegral olen) y ys 118 | on == c = i + fromIntegral h : go (h+1) 119 | otherwise = go (h+1) 120 where on = A.unsafeIndex oarr (ooff+h) 121 122 -- | The number of 'Word16' values in a 'Text'. 123 wordLength :: Text -> Int64 124 -- entered oncewordLength = foldlChunks sumLength 0 125 where sumLength i (T.Text _ _ l) = i + fromIntegral l 126 127 emptyError :: String -> a 128 -- never enteredemptyError fun = error ("Data.Text.Lazy.Search." ++ fun ++ ": empty input")