1 {-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
    2 
    3 -- |
    4 -- Module      : Data.Text.Search
    5 -- Copyright   : (c) Bryan O'Sullivan 2009
    6 --
    7 -- License     : BSD-style
    8 -- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
    9 --               duncan@haskell.org
   10 -- Stability   : experimental
   11 -- Portability : GHC
   12 --
   13 -- Fast substring search for 'Text', based on work by Boyer, Moore,
   14 -- Horspool, Sunday, and Lundh.
   15 --
   16 -- References:
   17 -- 
   18 -- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
   19 --   Communications of the ACM, 20, 10, 762-772 (1977)
   20 --
   21 -- * R. N. Horspool: Practical Fast Searching in Strings.  Software -
   22 --   Practice and Experience 10, 501-506 (1980)
   23 --
   24 -- * D. M. Sunday: A Very Fast Substring Search Algorithm.
   25 --   Communications of the ACM, 33, 8, 132-142 (1990)
   26 --
   27 -- * F. Lundh: The Fast Search Algorithm.
   28 --   <http://effbot.org/zone/stringlib.htm> (2006)
   29 
   30 module Data.Text.Search
   31     (
   32       indices
   33     ) where
   34 
   35 import qualified Data.Text.Array as A
   36 import Data.Word (Word64)
   37 import Data.Text.Internal (Text(..))
   38 import Data.Text.Fusion.Internal (PairS(..))
   39 import Data.Bits ((.|.), (.&.))
   40 import Data.Text.UnsafeShift (shiftL)
   41 
   42 -- | /O(n+m)/ Find the offsets of all non-overlapping indices of
   43 -- @needle@ within @haystack@.  The offsets returned represent
   44 -- locations in the low-level array.
   45 --
   46 -- In (unlikely) bad cases, this algorithm's complexity degrades
   47 -- towards /O(n*m)/.
   48 indices :: Text                -- ^ Substring to search for (@needle@)
   49         -> Text                -- ^ Text to search in (@haystack@)
   50         -> [Int]
   51 -- entered 2048 timesindices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
   52     | nlen == 1              = scanOne (nindex 0)
   53     | nlen <= 0 || ldiff < 0 = []
   54     | otherwise              = scan 0
   55   where
   56     ldiff    = hlen - nlen
   57     nlast    = nlen - 1
   58     z        = nindex nlast
   59     nindex k = A.unsafeIndex narr (noff+k)
   60     hindex k = A.unsafeIndex harr (hoff+k)
   61     hindex' k | k == hlen  = 0
   62               | otherwise = A.unsafeIndex harr (hoff+k)
   63     (mask :: Word64) :*: skip  = buildTable 0 0 (nlen-2)
   64     buildTable !i !msk !skp
   65         | i >= nlast           = (msk .|. swizzle z) :*: skp
   66         | otherwise            = buildTable (i+1) (msk .|. swizzle c) skp'
   67         where c                = nindex i
   68               skp' | c == z    = nlen - i - 2
   69                    | otherwise = skp
   70     swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f)
   71     scan !i
   72         | i > ldiff                  = []
   73         | c == z && candidateMatch 0 = i : scan (i + nlen)
   74         | otherwise                  = scan (i + delta)
   75         where c = hindex (i + nlast)
   76               candidateMatch !j
   77                     | j >= nlast               = True
   78                     | hindex (i+j) /= nindex j = False
   79                     | otherwise                = candidateMatch (j+1)
   80               delta | nextInPattern = nlen + 1
   81                     | c == z        = skip + 1
   82                     | otherwise     = 1
   83               nextInPattern         = mask .&. swizzle (hindex' (i+nlen)) == 0
   84     scanOne c = loop 0
   85         where loop !i | i >= hlen     = []
   86                       | hindex i == c = i : loop (i+1)
   87                       | otherwise     = loop (i+1)
   88 {-# INLINE indices #-}