1 {-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
    2 
    3 -- |
    4 -- Module      : Data.Text.Encoding.Fusion
    5 -- Copyright   : (c) Tom Harper 2008-2009,
    6 --               (c) Bryan O'Sullivan 2009,
    7 --               (c) Duncan Coutts 2009
    8 --
    9 -- License     : BSD-style
   10 -- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
   11 --               duncan@haskell.org
   12 -- Stability   : experimental
   13 -- Portability : portable
   14 --
   15 -- Fusible 'Stream'-oriented functions for converting between 'Text'
   16 -- and several common encodings.
   17 
   18 module Data.Text.Encoding.Fusion
   19     (
   20     -- * Streaming
   21       streamASCII
   22     , streamUtf8
   23     , streamUtf16LE
   24     , streamUtf16BE
   25     , streamUtf32LE
   26     , streamUtf32BE
   27 
   28     -- * Unstreaming
   29     , unstream
   30 
   31     , module Data.Text.Encoding.Fusion.Common
   32     ) where
   33 
   34 #if defined(ASSERTS)
   35 import Control.Exception (assert)
   36 #endif
   37 import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
   38 import Data.Text.Fusion (Step(..), Stream(..))
   39 import Data.Text.Fusion.Size
   40 import Data.Text.Encoding.Error
   41 import Data.Text.Encoding.Fusion.Common
   42 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
   43 import Data.Text.UnsafeShift (shiftL, shiftR)
   44 import Data.Word (Word8, Word16, Word32)
   45 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
   46 import Foreign.Storable (pokeByteOff)
   47 import System.IO.Unsafe (unsafePerformIO)
   48 import qualified Data.ByteString as B
   49 import qualified Data.ByteString.Unsafe as B
   50 import qualified Data.Text.Encoding.Utf8 as U8
   51 import qualified Data.Text.Encoding.Utf16 as U16
   52 import qualified Data.Text.Encoding.Utf32 as U32
   53 
   54 streamASCII :: ByteString -> Stream Char
   55 -- entered 182 timesstreamASCII bs = Stream next 0 (maxSize l)
   56     where
   57       l = B.length bs
   58       {-# INLINE next #-}
   59       next i
   60           | i >= l    = Done
   61           | otherwise = Yield (unsafeChr8 x1) (i+1)
   62           where
   63             x1 = B.unsafeIndex bs i
   64 {-# INLINE [0] streamASCII #-}
   65 
   66 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8
   67 -- encoding.
   68 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
   69 -- entered 200 timesstreamUtf8 onErr bs = Stream next 0 (maxSize l)
   70     where
   71       l = B.length bs
   72       next i
   73           | i >= l = Done
   74           | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1)
   75           | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2)
   76           | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3)
   77           | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4)
   78           | otherwise = decodeError "streamUtf8" "UTF-8" onErr (Just x1) (i+1)
   79           where
   80             x1 = idx i
   81             x2 = idx (i + 1)
   82             x3 = idx (i + 2)
   83             x4 = idx (i + 3)
   84             idx = B.unsafeIndex bs
   85 {-# INLINE [0] streamUtf8 #-}
   86 
   87 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
   88 -- endian UTF-16 encoding.
   89 streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
   90 -- entered 100 timesstreamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
   91     where
   92       l = B.length bs
   93       {-# INLINE next #-}
   94       next i
   95           | i >= l                         = Done
   96           | i+1 < l && U16.validate1 x1    = Yield (unsafeChr x1) (i+2)
   97           | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
   98           | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1)
   99           where
  100             x1    = idx i       + (idx (i + 1) `shiftL` 8)
  101             x2    = idx (i + 2) + (idx (i + 3) `shiftL` 8)
  102             idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16
  103 {-# INLINE [0] streamUtf16LE #-}
  104 
  105 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
  106 -- endian UTF-16 encoding.
  107 streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
  108 -- entered 100 timesstreamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
  109     where
  110       l = B.length bs
  111       {-# INLINE next #-}
  112       next i
  113           | i >= l                         = Done
  114           | i+1 < l && U16.validate1 x1    = Yield (unsafeChr x1) (i+2)
  115           | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
  116           | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1)
  117           where
  118             x1    = (idx i `shiftL` 8)       + idx (i + 1)
  119             x2    = (idx (i + 2) `shiftL` 8) + idx (i + 3)
  120             idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16
  121 {-# INLINE [0] streamUtf16BE #-}
  122 
  123 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
  124 -- endian UTF-32 encoding.
  125 streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
  126 -- entered 100 timesstreamUtf32BE onErr bs = Stream next 0 (maxSize (l `shiftR` 2))
  127     where
  128       l = B.length bs
  129       {-# INLINE next #-}
  130       next i
  131           | i >= l                    = Done
  132           | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
  133           | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1)
  134           where
  135             x     = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
  136             x1    = idx i
  137             x2    = idx (i+1)
  138             x3    = idx (i+2)
  139             x4    = idx (i+3)
  140             idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32
  141 {-# INLINE [0] streamUtf32BE #-}
  142 
  143 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
  144 -- endian UTF-32 encoding.
  145 streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
  146 -- entered 100 timesstreamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2))
  147     where
  148       l = B.length bs
  149       {-# INLINE next #-}
  150       next i
  151           | i >= l                    = Done
  152           | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
  153           | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1)
  154           where
  155             x     = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
  156             x1    = idx i
  157             x2    = idx $ i+1
  158             x3    = idx $ i+2
  159             x4    = idx $ i+3
  160             idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32
  161 {-# INLINE [0] streamUtf32LE #-}
  162 
  163 -- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'.
  164 unstream :: Stream Word8 -> ByteString
  165 -- entered 981 timesunstream (Stream next s0 len) = unsafePerformIO $ do
  166     let mlen = upperBound 4 len
  167     mallocByteString mlen >>= loop mlen 0 s0
  168     where
  169       loop !n !off !s fp = case next s of
  170           Done -> trimUp fp n off
  171           Skip s' -> loop n off s' fp
  172           Yield x s'
  173               | off == n -> realloc fp n off s' x
  174               | otherwise -> do
  175             withForeignPtr fp $ \p -> pokeByteOff p off x
  176             loop n (off+1) s' fp
  177       {-# NOINLINE realloc #-}
  178       realloc fp n off s x = do
  179         let n' = n+n
  180         fp' <- copy0 fp n n'
  181         withForeignPtr fp' $ \p -> pokeByteOff p off x
  182         loop n' (off+1) s fp'
  183       {-# NOINLINE trimUp #-}
  184       trimUp fp _ off = return $! PS fp 0 off
  185       copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
  186       copy0 !src !srcLen !destLen =
  187 #if defined(ASSERTS)
  188         assert (srcLen <= destLen) $
  189 #endif
  190         do
  191           dest <- mallocByteString destLen
  192           withForeignPtr src  $ \src'  ->
  193               withForeignPtr dest $ \dest' ->
  194                   memcpy dest' src' (fromIntegral srcLen)
  195           return dest
  196 
  197 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
  198             -> s -> Step s Char
  199 -- entered 758 timesdecodeError func kind onErr mb i =
  200     case onErr desc mb of
  201       Nothing -> Skip i
  202       Just c  -> Yield c i
  203     where desc = "Data.Text.Encoding.Fusion." ++ func ++ ": Invalid " ++
  204                  kind ++ " stream"