1 {-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
    2 
    3 -- |
    4 -- Module      : Data.Text.Lazy.Encoding.Fusion
    5 -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
    6 --
    7 -- License     : BSD-style
    8 -- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com, 
    9 --               duncan@haskell.org
   10 -- Stability   : experimental
   11 -- Portability : portable
   12 --
   13 -- Fusible 'Stream'-oriented functions for converting between lazy
   14 -- 'Text' and several common encodings.
   15 
   16 module Data.Text.Lazy.Encoding.Fusion
   17     (
   18     -- * Streaming
   19     --  streamASCII
   20       streamUtf8
   21     , streamUtf16LE
   22     , streamUtf16BE
   23     , streamUtf32LE
   24     , streamUtf32BE
   25 
   26     -- * Unstreaming
   27     , unstream
   28 
   29     , module Data.Text.Encoding.Fusion.Common
   30     ) where
   31 
   32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
   33 import qualified Data.ByteString as B
   34 import qualified Data.ByteString.Unsafe as B
   35 import Data.Text.Encoding.Fusion.Common
   36 import Data.Text.Encoding.Error
   37 import Data.Text.Fusion (Step(..), Stream(..))
   38 import Data.Text.Fusion.Size
   39 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
   40 import Data.Text.UnsafeShift (shiftL)
   41 import Data.Word (Word8, Word16, Word32)
   42 import qualified Data.Text.Encoding.Utf8 as U8
   43 import qualified Data.Text.Encoding.Utf16 as U16
   44 import qualified Data.Text.Encoding.Utf32 as U32
   45 import System.IO.Unsafe (unsafePerformIO)
   46 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
   47 import Foreign.Storable (pokeByteOff)
   48 import Data.ByteString.Internal (mallocByteString, memcpy)
   49 #if defined(ASSERTS)
   50 import Control.Exception (assert)
   51 #endif
   52 import qualified Data.ByteString.Internal as B
   53 
   54 data S = S0
   55        | S1 {-# UNPACK #-} !Word8
   56        | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
   57        | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
   58        | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
   59 
   60 data T = T !ByteString !S {-# UNPACK #-} !Int
   61 
   62 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
   63 -- UTF-8 encoding.
   64 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
   65 -- entered 100 timesstreamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize
   66   where
   67     next (T bs@(Chunk ps _) S0 i)
   68       | i < len && U8.validate1 a =
   69           Yield (unsafeChr8 a)    (T bs S0 (i+1))
   70       | i + 1 < len && U8.validate2 a b =
   71           Yield (U8.chr2 a b)     (T bs S0 (i+2))
   72       | i + 2 < len && U8.validate3 a b c =
   73           Yield (U8.chr3 a b c)   (T bs S0 (i+3))
   74       | i + 3 < len && U8.validate4 a b c d =
   75           Yield (U8.chr4 a b c d) (T bs S0 (i+4))
   76       where len = B.length ps
   77             a = B.unsafeIndex ps i
   78             b = B.unsafeIndex ps (i+1)
   79             c = B.unsafeIndex ps (i+2)
   80             d = B.unsafeIndex ps (i+3)
   81     next st@(T bs s i) =
   82       case s of
   83         S1 a       | U8.validate1 a       -> Yield (unsafeChr8 a)    es
   84         S2 a b     | U8.validate2 a b     -> Yield (U8.chr2 a b)     es
   85         S3 a b c   | U8.validate3 a b c   -> Yield (U8.chr3 a b c)   es
   86         S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es
   87         _ -> consume st
   88        where es = T bs S0 i
   89     consume (T bs@(Chunk ps rest) s i)
   90         | i >= B.length ps = consume (T rest s 0)
   91         | otherwise =
   92       case s of
   93         S0         -> next (T bs (S1 x)       (i+1))
   94         S1 a       -> next (T bs (S2 a x)     (i+1))
   95         S2 a b     -> next (T bs (S3 a b x)   (i+1))
   96         S3 a b c   -> next (T bs (S4 a b c x) (i+1))
   97         S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
   98                            (T bs (S3 b c d)   (i+1))
   99         where x = B.unsafeIndex ps i
  100     consume (T Empty S0 _) = Done
  101     consume st             = decodeError "streamUtf8" "UTF-8" onErr Nothing st
  102 {-# INLINE [0] streamUtf8 #-}
  103 
  104 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
  105 -- endian UTF-16 encoding.
  106 streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
  107 -- entered 100 timesstreamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  108   where
  109     next (T bs@(Chunk ps _) S0 i)
  110       | i + 1 < len && U16.validate1 x1 =
  111           Yield (unsafeChr x1)         (T bs S0 (i+2))
  112       | i + 3 < len && U16.validate2 x1 x2 =
  113           Yield (U16.chr2 x1 x2)       (T bs S0 (i+4))
  114       where len = B.length ps
  115             x1   = c (idx  i)      (idx (i + 1))
  116             x2   = c (idx (i + 2)) (idx (i + 3))
  117             c w1 w2 = w1 + (w2 `shiftL` 8)
  118             idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
  119     next st@(T bs s i) =
  120       case s of
  121         S2 w1 w2       | U16.validate1 (c w1 w2)           ->
  122           Yield (unsafeChr (c w1 w2))   es
  123         S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
  124           Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
  125         _ -> consume st
  126        where es = T bs S0 i
  127              c :: Word8 -> Word8 -> Word16
  128              c w1 w2 = fromIntegral w1 + (fromIntegral w2 `shiftL` 8)
  129     consume (T bs@(Chunk ps rest) s i)
  130         | i >= B.length ps = consume (T rest s 0)
  131         | otherwise =
  132       case s of
  133         S0             -> next (T bs (S1 x)          (i+1))
  134         S1 w1          -> next (T bs (S2 w1 x)       (i+1))
  135         S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
  136         S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
  137         S4 w1 w2 w3 w4 -> decodeError "streamUtf16LE" "UTF-16LE" onErr (Just w1)
  138                            (T bs (S3 w2 w3 w4)       (i+1))
  139         where x = B.unsafeIndex ps i
  140     consume (T Empty S0 _) = Done
  141     consume st             = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing st
  142 {-# INLINE [0] streamUtf16LE #-}
  143 
  144 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
  145 -- endian UTF-16 encoding.
  146 streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
  147 -- entered 100 timesstreamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  148   where
  149     next (T bs@(Chunk ps _) S0 i)
  150       | i + 1 < len && U16.validate1 x1 =
  151           Yield (unsafeChr x1)         (T bs S0 (i+2))
  152       | i + 3 < len && U16.validate2 x1 x2 =
  153           Yield (U16.chr2 x1 x2)       (T bs S0 (i+4))
  154       where len = B.length ps
  155             x1   = c (idx  i)      (idx (i + 1))
  156             x2   = c (idx (i + 2)) (idx (i + 3))
  157             c w1 w2 = (w1 `shiftL` 8) + w2
  158             idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
  159     next st@(T bs s i) =
  160       case s of
  161         S2 w1 w2       | U16.validate1 (c w1 w2)           ->
  162           Yield (unsafeChr (c w1 w2))   es
  163         S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
  164           Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
  165         _ -> consume st
  166        where es = T bs S0 i
  167              c :: Word8 -> Word8 -> Word16
  168              c w1 w2 = (fromIntegral w1 `shiftL` 8) + fromIntegral w2
  169     consume (T bs@(Chunk ps rest) s i)
  170         | i >= B.length ps = consume (T rest s 0)
  171         | otherwise =
  172       case s of
  173         S0             -> next (T bs (S1 x)          (i+1))
  174         S1 w1          -> next (T bs (S2 w1 x)       (i+1))
  175         S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
  176         S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
  177         S4 w1 w2 w3 w4 -> decodeError "streamUtf16BE" "UTF-16BE" onErr (Just w1)
  178                            (T bs (S3 w2 w3 w4)       (i+1))
  179         where x = B.unsafeIndex ps i
  180     consume (T Empty S0 _) = Done
  181     consume st             = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing st
  182 {-# INLINE [0] streamUtf16BE #-}
  183 
  184 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
  185 -- endian UTF-32 encoding.
  186 streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
  187 -- entered 100 timesstreamUtf32BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  188   where
  189     next (T bs@(Chunk ps _) S0 i)
  190       | i + 3 < len && U32.validate x =
  191           Yield (unsafeChr32 x)       (T bs S0 (i+4))
  192       where len = B.length ps
  193             x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
  194             x1    = idx i
  195             x2    = idx (i+1)
  196             x3    = idx (i+2)
  197             x4    = idx (i+3)
  198             idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
  199     next st@(T bs s i) =
  200       case s of
  201         S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
  202           Yield (unsafeChr32 (c w1 w2 w3 w4)) es
  203         _ -> consume st
  204        where es = T bs S0 i
  205              c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
  206              c w1 w2 w3 w4 = shifted
  207               where
  208                shifted = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
  209                x1 = fromIntegral w1
  210                x2 = fromIntegral w2
  211                x3 = fromIntegral w3
  212                x4 = fromIntegral w4
  213     consume (T bs@(Chunk ps rest) s i)
  214         | i >= B.length ps = consume (T rest s 0)
  215         | otherwise =
  216       case s of
  217         S0             -> next (T bs (S1 x)          (i+1))
  218         S1 w1          -> next (T bs (S2 w1 x)       (i+1))
  219         S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
  220         S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
  221         S4 w1 w2 w3 w4 -> decodeError "streamUtf32BE" "UTF-32BE" onErr (Just w1)
  222                            (T bs (S3 w2 w3 w4)       (i+1))
  223         where x = B.unsafeIndex ps i
  224     consume (T Empty S0 _) = Done
  225     consume st             = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing st
  226 {-# INLINE [0] streamUtf32BE #-}
  227 
  228 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
  229 -- endian UTF-32 encoding.
  230 streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
  231 -- entered 100 timesstreamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  232   where
  233     next (T bs@(Chunk ps _) S0 i)
  234       | i + 3 < len && U32.validate x =
  235           Yield (unsafeChr32 x)       (T bs S0 (i+4))
  236       where len = B.length ps
  237             x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
  238             x1    = idx i
  239             x2    = idx (i+1)
  240             x3    = idx (i+2)
  241             x4    = idx (i+3)
  242             idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
  243     next st@(T bs s i) =
  244       case s of
  245         S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
  246           Yield (unsafeChr32 (c w1 w2 w3 w4)) es
  247         _ -> consume st
  248        where es = T bs S0 i
  249              c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
  250              c w1 w2 w3 w4 = shifted
  251               where
  252                shifted = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
  253                x1 = fromIntegral w1
  254                x2 = fromIntegral w2
  255                x3 = fromIntegral w3
  256                x4 = fromIntegral w4
  257     consume (T bs@(Chunk ps rest) s i)
  258         | i >= B.length ps = consume (T rest s 0)
  259         | otherwise =
  260       case s of
  261         S0             -> next (T bs (S1 x)          (i+1))
  262         S1 w1          -> next (T bs (S2 w1 x)       (i+1))
  263         S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
  264         S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
  265         S4 w1 w2 w3 w4 -> decodeError "streamUtf32LE" "UTF-32LE" onErr (Just w1)
  266                            (T bs (S3 w2 w3 w4)       (i+1))
  267         where x = B.unsafeIndex ps i
  268     consume (T Empty S0 _) = Done
  269     consume st             = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing st
  270 {-# INLINE [0] streamUtf32LE #-}
  271 
  272 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
  273 unstreamChunks :: Int -> Stream Word8 -> ByteString
  274 -- entered 200 timesunstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
  275   where chunk s1 len1 = unsafePerformIO $ do
  276           let len = max 4 (min len1 chunkSize)
  277           mallocByteString len >>= loop len 0 s1
  278           where
  279             loop !n !off !s fp = case next s of
  280                 Done | off == 0 -> return Empty
  281                      | otherwise -> return $! Chunk (trimUp fp off) Empty
  282                 Skip s' -> loop n off s' fp
  283                 Yield x s'
  284                     | off == chunkSize -> do
  285                       let !newLen = n - off
  286                       return $! Chunk (trimUp fp off) (chunk s newLen)
  287                     | off == n -> realloc fp n off s' x
  288                     | otherwise -> do
  289                       withForeignPtr fp $ \p -> pokeByteOff p off x
  290                       loop n (off+1) s' fp
  291             {-# NOINLINE realloc #-}
  292             realloc fp n off s x = do
  293               let n' = min (n+n) chunkSize
  294               fp' <- copy0 fp n n'
  295               withForeignPtr fp' $ \p -> pokeByteOff p off x
  296               loop n' (off+1) s fp'
  297             trimUp fp off = B.PS fp 0 off
  298             copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
  299             copy0 !src !srcLen !destLen =
  300 #if defined(ASSERTS)
  301               assert (srcLen <= destLen) $
  302 #endif
  303               do
  304                 dest <- mallocByteString destLen
  305                 withForeignPtr src  $ \src'  ->
  306                     withForeignPtr dest $ \dest' ->
  307                         memcpy dest' src' (fromIntegral srcLen)
  308                 return dest
  309 
  310 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
  311 unstream :: Stream Word8 -> ByteString
  312 -- entered onceunstream = unstreamChunks defaultChunkSize
  313 
  314 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
  315             -> s -> Step s Char
  316 -- never entereddecodeError func kind onErr mb i =
  317     case onErr desc mb of
  318       Nothing -> Skip i
  319       Just c  -> Yield c i
  320     where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
  321                  kind ++ " stream"