1 {-# LANGUAGE BangPatterns, CPP #-}
    2 -- |
    3 -- Module      : Data.Text.Foreign
    4 -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
    5 --
    6 -- License     : BSD-style
    7 -- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
    8 --               duncan@haskell.org
    9 -- Stability   : experimental
   10 -- Portability : GHC
   11 --
   12 -- Support for using 'Text' data with native code via the Haskell
   13 -- foreign function interface.
   14 
   15 module Data.Text.Foreign
   16     (
   17     -- * Interoperability with native code
   18     -- $interop
   19 
   20     -- * Safe conversion functions
   21       fromPtr
   22     , useAsPtr
   23     -- * Unsafe conversion code
   24     , lengthWord16
   25     , unsafeCopyToPtr
   26     -- * Low-level manipulation
   27     -- $lowlevel
   28     , dropWord16
   29     , takeWord16
   30     ) where
   31 
   32 #if defined(ASSERTS)
   33 import Control.Exception (assert)
   34 #endif
   35 import Control.Monad.ST (unsafeIOToST)
   36 import Data.Text.Internal (Text(..), empty)
   37 import Data.Text.Unsafe (lengthWord16)
   38 import qualified Data.Text.Array as A
   39 import Data.Word (Word16)
   40 import Foreign.Marshal.Alloc (allocaBytes)
   41 import Foreign.Ptr (Ptr, castPtr, plusPtr)
   42 import Foreign.Storable (peek, poke)
   43 
   44 -- $interop
   45 --
   46 -- The 'Text' type is implemented using arrays that are not guaranteed
   47 -- to have a fixed address in the Haskell heap. All communication with
   48 -- native code must thus occur by copying data back and forth.
   49 --
   50 -- The 'Text' type's internal representation is UTF-16, using the
   51 -- platform's native endianness.  This makes copied data suitable for
   52 -- use with native libraries that use a similar representation, such
   53 -- as ICU.  To interoperate with native libraries that use different
   54 -- internal representations, such as UTF-8 or UTF-32, consider using
   55 -- the functions in the 'Data.Text.Encoding' module.
   56 
   57 -- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the
   58 -- contents of the array.
   59 fromPtr :: Ptr Word16           -- ^ source array
   60         -> Int                  -- ^ length of source array (in 'Word16' units)
   61         -> IO Text
   62 -- entered 100 timesfromPtr _   0   = return empty
   63 fromPtr ptr len =
   64 #if defined(ASSERTS)
   65     assert (len > 0) $
   66 #endif
   67     return $! Text arr 0 len
   68   where
   69     arr = A.run (A.unsafeNew len >>= copy)
   70     copy marr = loop ptr 0
   71       where
   72         loop !p !i | i == len = return marr
   73                    | otherwise = do
   74           A.unsafeWrite marr i =<< unsafeIOToST (peek p)
   75           loop (p `plusPtr` 2) (i + 1)
   76 
   77 -- $lowlevel
   78 --
   79 -- Foreign functions that use UTF-16 internally may return indices in
   80 -- units of 'Word16' instead of characters.  These functions may
   81 -- safely be used with such indices, as they will adjust offsets if
   82 -- necessary to preserve the validity of a Unicode string.
   83 
   84 -- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in
   85 -- length.
   86 --
   87 -- If @n@ would cause the 'Text' to end inside a surrogate pair, the
   88 -- end of the prefix will be advanced by one additional 'Word16' unit
   89 -- to maintain its validity.
   90 takeWord16 :: Int -> Text -> Text
   91 -- entered 200 timestakeWord16 n t@(Text arr off len)
   92     | n <= 0               = empty
   93     | n >= len || m >= len = t
   94     | otherwise            = Text arr off m
   95   where
   96     m | w < 0xDB00 || w > 0xD8FF = n
   97       | otherwise                = n+1
   98     w = A.unsafeIndex arr (off+n-1)
   99 
  100 -- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units
  101 -- dropped from its beginning.
  102 --
  103 -- If @n@ would cause the 'Text' to begin inside a surrogate pair, the
  104 -- beginning of the suffix will be advanced by one additional 'Word16'
  105 -- unit to maintain its validity.
  106 dropWord16 :: Int -> Text -> Text
  107 -- entered 200 timesdropWord16 n t@(Text arr off len)
  108     | n <= 0               = t
  109     | n >= len || m >= len = empty
  110     | otherwise            = Text arr (off+m) (len-m)
  111   where
  112     m | w < 0xD800 || w > 0xDBFF = n
  113       | otherwise                = n+1
  114     w = A.unsafeIndex arr (off+n-1)
  115 
  116 -- | /O(n)/ Copy a 'Text' to an array.  The array is assumed to be big
  117 -- enough to hold the contents of the entire 'Text'.
  118 unsafeCopyToPtr :: Text -> Ptr Word16 -> IO ()
  119 -- entered 100 timesunsafeCopyToPtr (Text arr off len) ptr = loop ptr off
  120   where
  121     end = off + len
  122     loop !p !i | i == end  = return ()
  123                | otherwise = do
  124       poke p (A.unsafeIndex arr i)
  125       loop (p `plusPtr` 2) (i + 1)
  126 
  127 -- | /O(n)/ Perform an action on a temporary, mutable copy of a
  128 -- 'Text'.  The copy is freed as soon as the action returns.
  129 useAsPtr :: Text -> (Ptr Word16 -> Int -> IO a) -> IO a
  130 -- entered 100 timesuseAsPtr t@(Text _arr _off len) action =
  131     allocaBytes (len * 2) $ \buf -> do
  132       unsafeCopyToPtr t buf
  133       action (castPtr buf) len