1 {-# LANGUAGE CPP, MagicHash #-}
    2 
    3 -- |
    4 -- Module      : Data.Text.Encoding.Utf8
    5 -- Copyright   : (c) 2008, 2009 Tom Harper,
    6 --               (c) 2009, 2010 Bryan O'Sullivan,
    7 --               (c) 2009 Duncan Coutts
    8 --
    9 -- License     : BSD-style
   10 -- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
   11 --               duncan@haskell.org
   12 -- Stability   : experimental
   13 -- Portability : GHC
   14 --
   15 -- Basic UTF-8 validation and character manipulation.
   16 module Data.Text.Encoding.Utf8
   17     (
   18     -- Decomposition
   19       ord2
   20     , ord3
   21     , ord4
   22     -- Construction
   23     , chr2
   24     , chr3
   25     , chr4
   26     -- * Validation
   27     , validate1
   28     , validate2
   29     , validate3
   30     , validate4
   31     ) where
   32 
   33 #if defined(ASSERTS)
   34 import Control.Exception (assert)
   35 #endif
   36 import Data.Bits ((.&.))
   37 import Data.Text.UnsafeChar (ord)
   38 import Data.Text.UnsafeShift (shiftR)
   39 import GHC.Exts
   40 import GHC.Word (Word8(..))
   41 
   42 default(Int)
   43 
   44 between :: Word8                -- ^ byte to check
   45         -> Word8                -- ^ lower bound
   46         -> Word8                -- ^ upper bound
   47         -> Bool
   48 -- entered 13,569 timesbetween x y z = x >= y && x <= z
   49 {-# INLINE between #-}
   50 
   51 ord2   :: Char -> (Word8,Word8)
   52 -- entered 29 timesord2 c =
   53 #if defined(ASSERTS)
   54     assert (n >= 0x80 && n <= 0x07ff)
   55 #endif
   56     (x1,x2)
   57     where
   58       n  = ord c
   59       x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
   60       x2 = fromIntegral $ (n .&. 0x3F)   + 0x80
   61 
   62 ord3   :: Char -> (Word8,Word8,Word8)
   63 -- entered 997 timesord3 c =
   64 #if defined(ASSERTS)
   65     assert (n >= 0x0800 && n <= 0xffff)
   66 #endif
   67     (x1,x2,x3)
   68     where
   69       n  = ord c
   70       x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
   71       x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
   72       x3 = fromIntegral $ (n .&. 0x3F) + 0x80
   73 
   74 ord4   :: Char -> (Word8,Word8,Word8,Word8)
   75 -- entered 3115 timesord4 c =
   76 #if defined(ASSERTS)
   77     assert (n >= 0x10000)
   78 #endif
   79     (x1,x2,x3,x4)
   80     where
   81       n  = ord c
   82       x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
   83       x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
   84       x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
   85       x4 = fromIntegral $ (n .&. 0x3F) + 0x80
   86 
   87 chr2       :: Word8 -> Word8 -> Char
   88 -- entered 74 timeschr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
   89     where
   90       !y1# = word2Int# x1#
   91       !y2# = word2Int# x2#
   92       !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
   93       !z2# = y2# -# 0x80#
   94 {-# INLINE chr2 #-}
   95 
   96 chr3          :: Word8 -> Word8 -> Word8 -> Char
   97 -- entered 1000 timeschr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
   98     where
   99       !y1# = word2Int# x1#
  100       !y2# = word2Int# x2#
  101       !y3# = word2Int# x3#
  102       !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
  103       !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
  104       !z3# = y3# -# 0x80#
  105 {-# INLINE chr3 #-}
  106 
  107 chr4             :: Word8 -> Word8 -> Word8 -> Word8 -> Char
  108 -- entered 3116 timeschr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
  109     C# (chr# (z1# +# z2# +# z3# +# z4#))
  110     where
  111       !y1# = word2Int# x1#
  112       !y2# = word2Int# x2#
  113       !y3# = word2Int# x3#
  114       !y4# = word2Int# x4#
  115       !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
  116       !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
  117       !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
  118       !z4# = y4# -# 0x80#
  119 {-# INLINE chr4 #-}
  120 
  121 validate1    :: Word8 -> Bool
  122 -- entered 6866 timesvalidate1 x1 = x1 <= 0x7F
  123 {-# INLINE validate1 #-}
  124 
  125 validate2       :: Word8 -> Word8 -> Bool
  126 -- entered 4929 timesvalidate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF
  127 {-# INLINE validate2 #-}
  128 
  129 validate3          :: Word8 -> Word8 -> Word8 -> Bool
  130 {-# INLINE validate3 #-}
  131 -- entered 4825 timesvalidate3 x1 x2 x3 = validate3_1 ||
  132                      validate3_2 ||
  133                      validate3_3 ||
  134                      validate3_4
  135   where
  136     validate3_1 = (x1 == 0xE0) &&
  137                   between x2 0xA0 0xBF &&
  138                   between x3 0x80 0xBF
  139     validate3_2 = between x1 0xE1 0xEC &&
  140                   between x2 0x80 0xBF &&
  141                   between x3 0x80 0xBF
  142     validate3_3 = x1 == 0xED &&
  143                   between x2 0x80 0x9F &&
  144                   between x3 0x80 0xBF
  145     validate3_4 = between x1 0xEE 0xEF &&
  146                   between x2 0x80 0xBF &&
  147                   between x3 0x80 0xBF
  148 
  149 validate4             :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
  150 {-# INLINE validate4 #-}
  151 -- entered 3799 timesvalidate4 x1 x2 x3 x4 = validate4_1 ||
  152                         validate4_2 ||
  153                         validate4_3
  154   where 
  155     validate4_1 = x1 == 0xF0 &&
  156                   between x2 0x90 0xBF &&
  157                   between x3 0x80 0xBF &&
  158                   between x4 0x80 0xBF
  159     validate4_2 = between x1 0xF1 0xF3 &&
  160                   between x2 0x80 0xBF &&
  161                   between x3 0x80 0xBF &&
  162                   between x4 0x80 0xBF
  163     validate4_3 = x1 == 0xF4 &&
  164                   between x2 0x80 0x8F &&
  165                   between x3 0x80 0xBF &&
  166                   between x4 0x80 0xBF