1 {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} 2 3 -- | 4 -- Module : Data.Text.Encoding.Fusion 5 -- Copyright : (c) Tom Harper 2008-2009, 6 -- (c) Bryan O'Sullivan 2009, 7 -- (c) Duncan Coutts 2009 8 -- 9 -- License : BSD-style 10 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 -- duncan@haskell.org 12 -- Stability : experimental 13 -- Portability : portable 14 -- 15 -- Fusible 'Stream'-oriented functions for converting between 'Text' 16 -- and several common encodings. 17 18 module Data.Text.Encoding.Fusion 19 ( 20 -- * Streaming 21 streamASCII 22 , streamUtf8 23 , streamUtf16LE 24 , streamUtf16BE 25 , streamUtf32LE 26 , streamUtf32BE 27 28 -- * Unstreaming 29 , unstream 30 31 , module Data.Text.Encoding.Fusion.Common 32 ) where 33 34 #if defined(ASSERTS) 35 import Control.Exception (assert) 36 #endif 37 import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy) 38 import Data.Text.Fusion (Step(..), Stream(..)) 39 import Data.Text.Fusion.Size 40 import Data.Text.Encoding.Error 41 import Data.Text.Encoding.Fusion.Common 42 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32) 43 import Data.Text.UnsafeShift (shiftL, shiftR) 44 import Data.Word (Word8, Word16, Word32) 45 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) 46 import Foreign.Storable (pokeByteOff) 47 import System.IO.Unsafe (unsafePerformIO) 48 import qualified Data.ByteString as B 49 import qualified Data.ByteString.Unsafe as B 50 import qualified Data.Text.Encoding.Utf8 as U8 51 import qualified Data.Text.Encoding.Utf16 as U16 52 import qualified Data.Text.Encoding.Utf32 as U32 53 54 streamASCII :: ByteString -> Stream Char 55 -- entered 182 timesstreamASCII bs = Stream next 0 (maxSize l) 56 where 57 l = B.length bs 58 {-# INLINE next #-} 59 next i 60 | i >= l = Done 61 | otherwise = Yield (unsafeChr8 x1) (i+1) 62 where 63 x1 = B.unsafeIndex bs i 64 {-# INLINE [0] streamASCII #-} 65 66 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8 67 -- encoding. 68 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char 69 -- entered 200 timesstreamUtf8 onErr bs = Stream next 0 (maxSize l) 70 where 71 l = B.length bs 72 next i 73 | i >= l = Done 74 | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1) 75 | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2) 76 | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3) 77 | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4) 78 | otherwise = decodeError "streamUtf8" "UTF-8" onErr (Just x1) (i+1) 79 where 80 x1 = idx i 81 x2 = idx (i + 1) 82 x3 = idx (i + 2) 83 x4 = idx (i + 3) 84 idx = B.unsafeIndex bs 85 {-# INLINE [0] streamUtf8 #-} 86 87 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 88 -- endian UTF-16 encoding. 89 streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char 90 -- entered 100 timesstreamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) 91 where 92 l = B.length bs 93 {-# INLINE next #-} 94 next i 95 | i >= l = Done 96 | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) 97 | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) 98 | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1) 99 where 100 x1 = idx i + (idx (i + 1) `shiftL` 8) 101 x2 = idx (i + 2) + (idx (i + 3) `shiftL` 8) 102 idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16 103 {-# INLINE [0] streamUtf16LE #-} 104 105 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 106 -- endian UTF-16 encoding. 107 streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char 108 -- entered 100 timesstreamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) 109 where 110 l = B.length bs 111 {-# INLINE next #-} 112 next i 113 | i >= l = Done 114 | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) 115 | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) 116 | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1) 117 where 118 x1 = (idx i `shiftL` 8) + idx (i + 1) 119 x2 = (idx (i + 2) `shiftL` 8) + idx (i + 3) 120 idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16 121 {-# INLINE [0] streamUtf16BE #-} 122 123 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 124 -- endian UTF-32 encoding. 125 streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char 126 -- entered 100 timesstreamUtf32BE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) 127 where 128 l = B.length bs 129 {-# INLINE next #-} 130 next i 131 | i >= l = Done 132 | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4) 133 | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1) 134 where 135 x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4 136 x1 = idx i 137 x2 = idx (i+1) 138 x3 = idx (i+2) 139 x4 = idx (i+3) 140 idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32 141 {-# INLINE [0] streamUtf32BE #-} 142 143 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 144 -- endian UTF-32 encoding. 145 streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char 146 -- entered 100 timesstreamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) 147 where 148 l = B.length bs 149 {-# INLINE next #-} 150 next i 151 | i >= l = Done 152 | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4) 153 | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1) 154 where 155 x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1 156 x1 = idx i 157 x2 = idx $ i+1 158 x3 = idx $ i+2 159 x4 = idx $ i+3 160 idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32 161 {-# INLINE [0] streamUtf32LE #-} 162 163 -- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'. 164 unstream :: Stream Word8 -> ByteString 165 -- entered 981 timesunstream (Stream next s0 len) = unsafePerformIO $ do 166 let mlen = upperBound 4 len 167 mallocByteString mlen >>= loop mlen 0 s0 168 where 169 loop !n !off !s fp = case next s of 170 Done -> trimUp fp n off 171 Skip s' -> loop n off s' fp 172 Yield x s' 173 | off == n -> realloc fp n off s' x 174 | otherwise -> do 175 withForeignPtr fp $ \p -> pokeByteOff p off x 176 loop n (off+1) s' fp 177 {-# NOINLINE realloc #-} 178 realloc fp n off s x = do 179 let n' = n+n 180 fp' <- copy0 fp n n' 181 withForeignPtr fp' $ \p -> pokeByteOff p off x 182 loop n' (off+1) s fp' 183 {-# NOINLINE trimUp #-} 184 trimUp fp _ off = return $! PS fp 0 off 185 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8) 186 copy0 !src !srcLen !destLen = 187 #if defined(ASSERTS) 188 assert (srcLen <= destLen) $ 189 #endif 190 do 191 dest <- mallocByteString destLen 192 withForeignPtr src $ \src' -> 193 withForeignPtr dest $ \dest' -> 194 memcpy dest' src' (fromIntegral srcLen) 195 return dest 196 197 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8 198 -> s -> Step s Char 199 -- entered 758 timesdecodeError func kind onErr mb i = 200 case onErr desc mb of 201 Nothing -> Skip i 202 Just c -> Yield c i 203 where desc = "Data.Text.Encoding.Fusion." ++ func ++ ": Invalid " ++ 204 kind ++ " stream"