1 {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} 2 3 -- | 4 -- Module : Data.Text.Lazy.Encoding.Fusion 5 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 6 -- 7 -- License : BSD-style 8 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 -- duncan@haskell.org 10 -- Stability : experimental 11 -- Portability : portable 12 -- 13 -- Fusible 'Stream'-oriented functions for converting between lazy 14 -- 'Text' and several common encodings. 15 16 module Data.Text.Lazy.Encoding.Fusion 17 ( 18 -- * Streaming 19 -- streamASCII 20 streamUtf8 21 , streamUtf16LE 22 , streamUtf16BE 23 , streamUtf32LE 24 , streamUtf32BE 25 26 -- * Unstreaming 27 , unstream 28 29 , module Data.Text.Encoding.Fusion.Common 30 ) where 31 32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize) 33 import qualified Data.ByteString as B 34 import qualified Data.ByteString.Unsafe as B 35 import Data.Text.Encoding.Fusion.Common 36 import Data.Text.Encoding.Error 37 import Data.Text.Fusion (Step(..), Stream(..)) 38 import Data.Text.Fusion.Size 39 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32) 40 import Data.Text.UnsafeShift (shiftL) 41 import Data.Word (Word8, Word16, Word32) 42 import qualified Data.Text.Encoding.Utf8 as U8 43 import qualified Data.Text.Encoding.Utf16 as U16 44 import qualified Data.Text.Encoding.Utf32 as U32 45 import System.IO.Unsafe (unsafePerformIO) 46 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) 47 import Foreign.Storable (pokeByteOff) 48 import Data.ByteString.Internal (mallocByteString, memcpy) 49 #if defined(ASSERTS) 50 import Control.Exception (assert) 51 #endif 52 import qualified Data.ByteString.Internal as B 53 54 data S = S0 55 | S1 {-# UNPACK #-} !Word8 56 | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 57 | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 58 | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 59 60 data T = T !ByteString !S {-# UNPACK #-} !Int 61 62 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using 63 -- UTF-8 encoding. 64 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char 65 -- entered 100 timesstreamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize 66 where 67 next (T bs@(Chunk ps _) S0 i) 68 | i < len && U8.validate1 a = 69 Yield (unsafeChr8 a) (T bs S0 (i+1)) 70 | i + 1 < len && U8.validate2 a b = 71 Yield (U8.chr2 a b) (T bs S0 (i+2)) 72 | i + 2 < len && U8.validate3 a b c = 73 Yield (U8.chr3 a b c) (T bs S0 (i+3)) 74 | i + 3 < len && U8.validate4 a b c d = 75 Yield (U8.chr4 a b c d) (T bs S0 (i+4)) 76 where len = B.length ps 77 a = B.unsafeIndex ps i 78 b = B.unsafeIndex ps (i+1) 79 c = B.unsafeIndex ps (i+2) 80 d = B.unsafeIndex ps (i+3) 81 next st@(T bs s i) = 82 case s of 83 S1 a | U8.validate1 a -> Yield (unsafeChr8 a) es 84 S2 a b | U8.validate2 a b -> Yield (U8.chr2 a b) es 85 S3 a b c | U8.validate3 a b c -> Yield (U8.chr3 a b c) es 86 S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es 87 _ -> consume st 88 where es = T bs S0 i 89 consume (T bs@(Chunk ps rest) s i) 90 | i >= B.length ps = consume (T rest s 0) 91 | otherwise = 92 case s of 93 S0 -> next (T bs (S1 x) (i+1)) 94 S1 a -> next (T bs (S2 a x) (i+1)) 95 S2 a b -> next (T bs (S3 a b x) (i+1)) 96 S3 a b c -> next (T bs (S4 a b c x) (i+1)) 97 S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a) 98 (T bs (S3 b c d) (i+1)) 99 where x = B.unsafeIndex ps i 100 consume (T Empty S0 _) = Done 101 consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st 102 {-# INLINE [0] streamUtf8 #-} 103 104 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 105 -- endian UTF-16 encoding. 106 streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char 107 -- entered 100 timesstreamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize 108 where 109 next (T bs@(Chunk ps _) S0 i) 110 | i + 1 < len && U16.validate1 x1 = 111 Yield (unsafeChr x1) (T bs S0 (i+2)) 112 | i + 3 < len && U16.validate2 x1 x2 = 113 Yield (U16.chr2 x1 x2) (T bs S0 (i+4)) 114 where len = B.length ps 115 x1 = c (idx i) (idx (i + 1)) 116 x2 = c (idx (i + 2)) (idx (i + 3)) 117 c w1 w2 = w1 + (w2 `shiftL` 8) 118 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16 119 next st@(T bs s i) = 120 case s of 121 S2 w1 w2 | U16.validate1 (c w1 w2) -> 122 Yield (unsafeChr (c w1 w2)) es 123 S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) -> 124 Yield (U16.chr2 (c w1 w2) (c w3 w4)) es 125 _ -> consume st 126 where es = T bs S0 i 127 c :: Word8 -> Word8 -> Word16 128 c w1 w2 = fromIntegral w1 + (fromIntegral w2 `shiftL` 8) 129 consume (T bs@(Chunk ps rest) s i) 130 | i >= B.length ps = consume (T rest s 0) 131 | otherwise = 132 case s of 133 S0 -> next (T bs (S1 x) (i+1)) 134 S1 w1 -> next (T bs (S2 w1 x) (i+1)) 135 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1)) 136 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1)) 137 S4 w1 w2 w3 w4 -> decodeError "streamUtf16LE" "UTF-16LE" onErr (Just w1) 138 (T bs (S3 w2 w3 w4) (i+1)) 139 where x = B.unsafeIndex ps i 140 consume (T Empty S0 _) = Done 141 consume st = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing st 142 {-# INLINE [0] streamUtf16LE #-} 143 144 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 145 -- endian UTF-16 encoding. 146 streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char 147 -- entered 100 timesstreamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize 148 where 149 next (T bs@(Chunk ps _) S0 i) 150 | i + 1 < len && U16.validate1 x1 = 151 Yield (unsafeChr x1) (T bs S0 (i+2)) 152 | i + 3 < len && U16.validate2 x1 x2 = 153 Yield (U16.chr2 x1 x2) (T bs S0 (i+4)) 154 where len = B.length ps 155 x1 = c (idx i) (idx (i + 1)) 156 x2 = c (idx (i + 2)) (idx (i + 3)) 157 c w1 w2 = (w1 `shiftL` 8) + w2 158 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16 159 next st@(T bs s i) = 160 case s of 161 S2 w1 w2 | U16.validate1 (c w1 w2) -> 162 Yield (unsafeChr (c w1 w2)) es 163 S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) -> 164 Yield (U16.chr2 (c w1 w2) (c w3 w4)) es 165 _ -> consume st 166 where es = T bs S0 i 167 c :: Word8 -> Word8 -> Word16 168 c w1 w2 = (fromIntegral w1 `shiftL` 8) + fromIntegral w2 169 consume (T bs@(Chunk ps rest) s i) 170 | i >= B.length ps = consume (T rest s 0) 171 | otherwise = 172 case s of 173 S0 -> next (T bs (S1 x) (i+1)) 174 S1 w1 -> next (T bs (S2 w1 x) (i+1)) 175 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1)) 176 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1)) 177 S4 w1 w2 w3 w4 -> decodeError "streamUtf16BE" "UTF-16BE" onErr (Just w1) 178 (T bs (S3 w2 w3 w4) (i+1)) 179 where x = B.unsafeIndex ps i 180 consume (T Empty S0 _) = Done 181 consume st = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing st 182 {-# INLINE [0] streamUtf16BE #-} 183 184 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 185 -- endian UTF-32 encoding. 186 streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char 187 -- entered 100 timesstreamUtf32BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize 188 where 189 next (T bs@(Chunk ps _) S0 i) 190 | i + 3 < len && U32.validate x = 191 Yield (unsafeChr32 x) (T bs S0 (i+4)) 192 where len = B.length ps 193 x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4 194 x1 = idx i 195 x2 = idx (i+1) 196 x3 = idx (i+2) 197 x4 = idx (i+3) 198 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32 199 next st@(T bs s i) = 200 case s of 201 S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) -> 202 Yield (unsafeChr32 (c w1 w2 w3 w4)) es 203 _ -> consume st 204 where es = T bs S0 i 205 c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32 206 c w1 w2 w3 w4 = shifted 207 where 208 shifted = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4 209 x1 = fromIntegral w1 210 x2 = fromIntegral w2 211 x3 = fromIntegral w3 212 x4 = fromIntegral w4 213 consume (T bs@(Chunk ps rest) s i) 214 | i >= B.length ps = consume (T rest s 0) 215 | otherwise = 216 case s of 217 S0 -> next (T bs (S1 x) (i+1)) 218 S1 w1 -> next (T bs (S2 w1 x) (i+1)) 219 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1)) 220 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1)) 221 S4 w1 w2 w3 w4 -> decodeError "streamUtf32BE" "UTF-32BE" onErr (Just w1) 222 (T bs (S3 w2 w3 w4) (i+1)) 223 where x = B.unsafeIndex ps i 224 consume (T Empty S0 _) = Done 225 consume st = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing st 226 {-# INLINE [0] streamUtf32BE #-} 227 228 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 229 -- endian UTF-32 encoding. 230 streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char 231 -- entered 100 timesstreamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize 232 where 233 next (T bs@(Chunk ps _) S0 i) 234 | i + 3 < len && U32.validate x = 235 Yield (unsafeChr32 x) (T bs S0 (i+4)) 236 where len = B.length ps 237 x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1 238 x1 = idx i 239 x2 = idx (i+1) 240 x3 = idx (i+2) 241 x4 = idx (i+3) 242 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32 243 next st@(T bs s i) = 244 case s of 245 S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) -> 246 Yield (unsafeChr32 (c w1 w2 w3 w4)) es 247 _ -> consume st 248 where es = T bs S0 i 249 c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32 250 c w1 w2 w3 w4 = shifted 251 where 252 shifted = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1 253 x1 = fromIntegral w1 254 x2 = fromIntegral w2 255 x3 = fromIntegral w3 256 x4 = fromIntegral w4 257 consume (T bs@(Chunk ps rest) s i) 258 | i >= B.length ps = consume (T rest s 0) 259 | otherwise = 260 case s of 261 S0 -> next (T bs (S1 x) (i+1)) 262 S1 w1 -> next (T bs (S2 w1 x) (i+1)) 263 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1)) 264 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1)) 265 S4 w1 w2 w3 w4 -> decodeError "streamUtf32LE" "UTF-32LE" onErr (Just w1) 266 (T bs (S3 w2 w3 w4) (i+1)) 267 where x = B.unsafeIndex ps i 268 consume (T Empty S0 _) = Done 269 consume st = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing st 270 {-# INLINE [0] streamUtf32LE #-} 271 272 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'. 273 unstreamChunks :: Int -> Stream Word8 -> ByteString 274 -- entered 200 timesunstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0) 275 where chunk s1 len1 = unsafePerformIO $ do 276 let len = max 4 (min len1 chunkSize) 277 mallocByteString len >>= loop len 0 s1 278 where 279 loop !n !off !s fp = case next s of 280 Done | off == 0 -> return Empty 281 | otherwise -> return $! Chunk (trimUp fp off) Empty 282 Skip s' -> loop n off s' fp 283 Yield x s' 284 | off == chunkSize -> do 285 let !newLen = n - off 286 return $! Chunk (trimUp fp off) (chunk s newLen) 287 | off == n -> realloc fp n off s' x 288 | otherwise -> do 289 withForeignPtr fp $ \p -> pokeByteOff p off x 290 loop n (off+1) s' fp 291 {-# NOINLINE realloc #-} 292 realloc fp n off s x = do 293 let n' = min (n+n) chunkSize 294 fp' <- copy0 fp n n' 295 withForeignPtr fp' $ \p -> pokeByteOff p off x 296 loop n' (off+1) s fp' 297 trimUp fp off = B.PS fp 0 off 298 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8) 299 copy0 !src !srcLen !destLen = 300 #if defined(ASSERTS) 301 assert (srcLen <= destLen) $ 302 #endif 303 do 304 dest <- mallocByteString destLen 305 withForeignPtr src $ \src' -> 306 withForeignPtr dest $ \dest' -> 307 memcpy dest' src' (fromIntegral srcLen) 308 return dest 309 310 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'. 311 unstream :: Stream Word8 -> ByteString 312 -- entered onceunstream = unstreamChunks defaultChunkSize 313 314 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8 315 -> s -> Step s Char 316 -- never entereddecodeError func kind onErr mb i = 317 case onErr desc mb of 318 Nothing -> Skip i 319 Just c -> Yield c i 320 where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++ 321 kind ++ " stream"