1 {-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
    2 
    3 -- |
    4 -- Module      : Data.Text.Lazy.Search
    5 -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
    6 --
    7 -- License     : BSD-style
    8 -- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
    9 --               duncan@haskell.org
   10 -- Stability   : experimental
   11 -- Portability : GHC
   12 --
   13 -- Fast substring search for lazy 'Text', based on work by Boyer,
   14 -- Moore, Horspool, Sunday, and Lundh.  Adapted from the strict
   15 -- implementation.
   16 
   17 module Data.Text.Lazy.Search
   18     (
   19       indices
   20     ) where
   21 
   22 import qualified Data.Text.Array as A
   23 import Data.Int (Int64)
   24 import Data.Word (Word16, Word64)
   25 import qualified Data.Text.Internal as T
   26 import Data.Text.Fusion.Internal (PairS(..))
   27 import Data.Text.Lazy.Internal (Text(..), foldlChunks)
   28 import Data.Bits ((.|.), (.&.))
   29 import Data.Text.UnsafeShift (shiftL)
   30 
   31 -- | /O(n+m)/ Find the offsets of all non-overlapping indices of
   32 -- @needle@ within @haystack@.
   33 --
   34 -- This function is strict in @needle@, and lazy (as far as possible)
   35 -- in the chunks of @haystack@.
   36 --
   37 -- In (unlikely) bad cases, this algorithm's complexity degrades
   38 -- towards /O(n*m)/.
   39 indices :: Text              -- ^ Substring to search for (@needle@)
   40         -> Text              -- ^ Text to search in (@haystack@)
   41         -> [Int64]
   42 -- entered 1817 timesindices needle@(Chunk n ns) _haystack@(Chunk k ks)
   43     | nlen <= 0  = []
   44     | nlen == 1  = indicesOne (nindex 0) 0 k ks
   45     | otherwise  = scan 0 0 k ks
   46   where
   47     scan !g !i x@(T.Text _ _ l) xs
   48          | i >= m = case xs of
   49                       Empty           -> []
   50                       Chunk y ys      -> scan g (i-m) y ys
   51          | lackingHay (i + nlen) x xs  = []
   52          | c == z && candidateMatch 0  = g : scan (g+nlen) (i+nlen) x xs
   53          | otherwise                   = scan (g+delta) (i+delta) x xs
   54        where
   55          m = fromIntegral l
   56          c = hindex (i + nlast)
   57          delta | nextInPattern = nlen + 1
   58                | c == z        = skip + 1
   59                | otherwise     = 1
   60          nextInPattern         = mask .&. swizzle (hindex (i+nlen)) == 0
   61          candidateMatch !j
   62              | j >= nlast               = True
   63              | hindex (i+j) /= nindex j = False
   64              | otherwise                = candidateMatch (j+1)
   65          hindex                         = index x xs
   66     nlen      = wordLength needle
   67     nlast     = nlen - 1
   68     nindex    = index n ns
   69     z         = foldlChunks fin 0 needle
   70         where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
   71     (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2)
   72     swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f)
   73     buildTable (T.Text xarr xoff xlen) xs = go
   74       where
   75         go !(g::Int64) !i !msk !skp
   76             | i >= xlast = case xs of
   77                              Empty      -> (msk .|. swizzle z) :*: skp
   78                              Chunk y ys -> buildTable y ys g 0 msk skp
   79             | otherwise = go (g+1) (i+1) (msk .|. swizzle c) skp'
   80             where c                = A.unsafeIndex xarr (xoff+i)
   81                   skp' | c == z    = nlen - fromIntegral g - 2
   82                        | otherwise = skp
   83                   xlast = xlen - 1
   84     -- | Check whether an attempt to index into the haystack at the
   85     -- given offset would fail.
   86     lackingHay q = go 0
   87       where
   88         go p (T.Text _ _ l) ps = p' < q && case ps of
   89                                              Empty      -> True
   90                                              Chunk r rs -> go p' r rs
   91             where p' = p + fromIntegral l
   92 indices _ _ = []
   93 
   94 -- | Fast index into a partly unpacked 'Text'.  We take into account
   95 -- the possibility that the caller might try to access one element
   96 -- past the end.
   97 index :: T.Text -> Text -> Int64 -> Word16
   98 -- entered 423,667 timesindex (T.Text arr off len) xs !i
   99     | j < len   = A.unsafeIndex arr (off+j)
  100     | otherwise = case xs of
  101                     Empty
  102                         -- out of bounds, but legal
  103                         | j == len  -> 0
  104                         -- should never happen, due to lackingHay above
  105                         | otherwise -> emptyError "index"
  106                     Chunk c cs -> index c cs (i-fromIntegral len)
  107     where j = fromIntegral i
  108 
  109 -- | A variant of 'indices' that scans linearly for a single 'Word16'.
  110 indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64]
  111 -- entered 21 timesindicesOne c = chunk
  112   where
  113     chunk !i (T.Text oarr ooff olen) os = go 0
  114       where
  115         go h | h >= olen = case os of
  116                              Empty      -> []
  117                              Chunk y ys -> chunk (i+fromIntegral olen) y ys
  118              | on == c = i + fromIntegral h : go (h+1)
  119              | otherwise = go (h+1)
  120              where on = A.unsafeIndex oarr (ooff+h)
  121 
  122 -- | The number of 'Word16' values in a 'Text'.
  123 wordLength :: Text -> Int64
  124 -- entered oncewordLength = foldlChunks sumLength 0
  125     where sumLength i (T.Text _ _ l) = i + fromIntegral l
  126 
  127 emptyError :: String -> a
  128 -- never enteredemptyError fun = error ("Data.Text.Lazy.Search." ++ fun ++ ": empty input")