1 {-# LANGUAGE CPP #-}
    2 --------------------------------------------------------------------
    3 -- |
    4 -- Module   : Text.Regex.PCRE.Light
    5 -- Copyright: Copyright (c) 2007-2008, Don Stewart
    6 -- License  : BSD3
    7 --
    8 -- Maintainer:  Don Stewart <dons@galois.com>
    9 -- Stability :  experimental
   10 -- Portability: H98 + CPP
   11 --
   12 --------------------------------------------------------------------
   13 -- 
   14 -- A simple, portable binding to perl-compatible regular expressions
   15 -- (PCRE) via strict ByteStrings.
   16 --
   17 
   18 module Text.Regex.PCRE.Light (
   19 
   20         -- * The abstract PCRE Regex type
   21           Regex
   22 
   23         -- * ByteString interface
   24         , compile, compileM
   25         , match
   26 
   27         -- * Regex types and constructors externally visible
   28 
   29         -- ** PCRE compile-time bit flags
   30         , PCREOption
   31 
   32         , anchored
   33         , auto_callout
   34         {-, bsr_anycrlf-}
   35         {-, bsr_unicode-}
   36         , caseless
   37         , dollar_endonly
   38         , dotall
   39         , dupnames
   40         , extended
   41         , extra
   42         , firstline
   43         , multiline
   44         {-, newline_any-}
   45         {-, newline_anycrlf-}
   46         , newline_cr
   47         , newline_crlf
   48         , newline_lf
   49         , no_auto_capture
   50         , ungreedy
   51         , utf8
   52         , no_utf8_check
   53 
   54         -- ** PCRE exec-time bit flags
   55         , PCREExecOption
   56 
   57         , exec_anchored
   58         {-, exec_newline_any     -}
   59         {-, exec_newline_anycrlf -}
   60         , exec_newline_cr
   61         , exec_newline_crlf
   62         , exec_newline_lf
   63         , exec_notbol
   64         , exec_noteol
   65         , exec_notempty
   66         , exec_no_utf8_check
   67         , exec_partial
   68 
   69     ) where
   70 
   71 import Text.Regex.PCRE.Light.Base
   72 
   73 -- Strings
   74 import qualified Data.ByteString          as S
   75 
   76 #if __GLASGOW_HASKELL__ >= 608
   77 import qualified Data.ByteString.Internal as S
   78 import qualified Data.ByteString.Unsafe   as S
   79 #else
   80 import qualified Data.ByteString.Base     as S
   81 #endif
   82 
   83 import Control.Monad
   84 
   85 -- Foreigns
   86 import Foreign
   87 import Foreign.Ptr
   88 import Foreign.C.Types
   89 import Foreign.C.String
   90 import Foreign.Storable
   91 import Foreign.Marshal.Alloc
   92 
   93 -- | 'compile'
   94 --
   95 -- Compile a perl-compatible regular expression stored in a strict bytestring.
   96 --
   97 -- An example
   98 --
   99 -- > let r = compile (pack "^(b+|a){1,2}?bc") []
  100 --
  101 -- Or using GHC's -XOverloadedStrings flag, and importing
  102 -- Data.ByteString.Char8, we can avoid the pack:
  103 --
  104 -- > let r = compile "^(b+|a){1,2}?bc" []
  105 --
  106 -- If the regular expression is invalid, an exception is thrown.
  107 -- If this is unsuitable, 'compileM' is availlable, which returns failure 
  108 -- in a monad.
  109 --
  110 -- To do case insentive matching,
  111 --
  112 -- > compile "^(b+|a){1,2}?bc" [caseless]
  113 --
  114 -- Other flags are documented below.
  115 --
  116 -- The resulting abstract regular expression can be passed to 'match'
  117 -- for matching against a subject string.
  118 --
  119 -- The arguments are:
  120 --
  121 -- * 'pat': A ByteString containing the regular expression to be compiled. 
  122 --
  123 -- * 'flags', optional bit flags. If 'Nothing' is provided, defaults are used.
  124 --
  125 -- Valid compile-time flags are:
  126 --
  127 -- * 'anchored'        - Force pattern anchoring
  128 --
  129 -- * 'auto_callout'    - Compile automatic callouts
  130 --
  131 -- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
  132 --
  133 -- * 'bsr_unicode'     - \\R matches all Unicode line endings
  134 --
  135 -- * 'caseless'        - Do caseless matching
  136 --
  137 -- * 'dollar_endonly'  - '$' not to match newline at end
  138 --
  139 -- * 'dotall'          - matches anything including NL
  140 --
  141 -- * 'dupnames'        - Allow duplicate names for subpatterns
  142 --
  143 -- * 'extended'        - Ignore whitespace and # comments
  144 --
  145 -- * 'extra'           - PCRE extra features (not much use currently)
  146 --
  147 -- * 'firstline'       - Force matching to be  before  newline
  148 --
  149 -- * 'multiline'       - '^' and '$' match newlines within data
  150 --
  151 -- * 'newline_any'     - Recognize any Unicode newline sequence
  152 --
  153 -- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
  154 --
  155 -- * 'newline_cr'      - Set CR as the newline sequence
  156 --
  157 -- * 'newline_crlf'    - Set CRLF as the newline sequence
  158 --
  159 -- * 'newline_lf'      - Set LF as the newline sequence
  160 --
  161 -- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
  162 --
  163 -- * 'ungreedy'        - Invert greediness of quantifiers
  164 --
  165 -- * 'utf8'            - Run in UTF-8 mode
  166 --
  167 -- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
  168 --
  169 -- The regex is allocated via malloc on the C side, and will be
  170 -- deallocated by the runtime when the Haskell value representing it
  171 -- goes out of scope.
  172 --
  173 -- See 'man pcreapi for more details.
  174 --
  175 -- Caveats: patterns with embedded nulls, such as "\0*" seem to be
  176 -- mishandled, as this won't currently match the subject "\0\0\0".
  177 --
  178 compile :: S.ByteString -> [PCREOption] -> Regex
  179 -- entered 1071 timescompile s o = case compileM s o of
  180     Just  r -> r
  181     Nothing -> error ("Text.Regex.PCRE.Light: Error in regex: " ++ show s)
  182 
  183 ------------------------------------------------------------------------
  184 
  185 -- | 'compileM'
  186 -- A generic version of 'compile' with failure lifted into an arbitrary monad.
  187 --
  188 -- Examples, illustrating how failure can be propagated to an IO exception, 
  189 -- or tagged as 'Nothing':
  190 --
  191 -- > > compileM ".*" [] :: Maybe Regex
  192 -- > Just (Regex 0x000000004bb5b540 ".*")
  193 --
  194 -- > > compileM "*" [] :: Maybe Regex
  195 -- > Nothing
  196 --
  197 -- > > compileM "*" [] :: IO Regex
  198 -- > *** Exception: user error (nothing to repeat)
  199 --
  200 -- > > compileM ".*" [] :: IO Regex
  201 -- > Regex 0x000000004bb5b780 ".*"
  202 --
  203 -- > > :m + Control.Monad.Error
  204 --
  205 -- > > compileM ".*" [] :: Either String Regex
  206 -- > Right (Regex 0x000000004bb5b980 ".*")
  207 --
  208 -- > > compileM "*" [] :: Either String Regex
  209 -- > Left "nothing to repeat"
  210 --
  211 compileM :: Monad m => S.ByteString -> [PCREOption] -> m Regex
  212 -- entered 2194 timescompileM str os = unsafePerformIO $
  213   S.useAsCString str $ \pattern -> do
  214     alloca $ \errptr       -> do
  215     alloca $ \erroffset    -> do
  216         pcre_ptr <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
  217         if pcre_ptr == nullPtr
  218             then do
  219                 err <- peekCString =<< peek errptr
  220                 return (fail err)
  221             else do
  222                 reg <- newForeignPtr finalizerFree pcre_ptr -- release with free()
  223                 return (return (Regex reg str))
  224 
-- Possible improvements: an 'IsString' instance could be defined
-- for 'Regex', which would allow the compiler to insert calls to
-- 'compile' based on the type:
--
-- The following would be valid:
--
-- > match "a.*b" "abcdef" []
--
-- and equivalent to:
--
-- > match (compile "a.*b" []) "abcdef" []
  236 
  237 -- | 'match'
  238 --
  239 -- Matches a compiled regular expression against a given subject string,
  240 -- using a matching algorithm that is similar to Perl's. If the subject
  241 -- string doesn't match the regular expression, 'Nothing' is returned,
  242 -- otherwise the portion of the string that matched is returned, along
  243 -- with any captured subpatterns.
  244 --
  245 -- The arguments are:
  246 --
  247 -- * 'regex', a PCRE regular expression value produced by compile
  248 --
  249 -- * 'subject', the subject string to match against
  250 --
  251 -- * 'options', an optional set of exec-time flags to exec.
  252 --
  253 -- Available runtime options are:
  254 --
  255 -- * 'exec_anchored'        - Match only at the first position
  256 --
  257 -- * 'exec_newline_any'     - Recognize any Unicode newline sequence
  258 --
  259 -- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
  260 --
  261 -- * 'exec_newline_cr'      - Set CR as the newline sequence
  262 --
  263 -- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
  264 --
  265 -- * 'exec_newline_lf'      - Set LF as the newline sequence
  266 --
  267 -- * 'exec_notbol'          - Subject is not the beginning of a line
  268 --
  269 -- * 'exec_noteol'          - Subject is not the end of a line
  270 --
  271 -- * 'exec_notempty'        - An empty string is not a valid match
  272 --
  273 -- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
  274 --
  275 -- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
  276 --
  277 -- The result value, and any captured subpatterns, are returned.
  278 -- If the regex is invalid, or the subject string is empty, Nothing
  279 -- is returned.
  280 --
  281 match :: Regex -> S.ByteString -> [PCREExecOption] -> Maybe [S.ByteString]
  282 -- entered 4684 timesmatch (Regex pcre_fp _) subject os = unsafePerformIO $ do
  283   withForeignPtr pcre_fp $ \pcre_ptr -> do
  284     n_capt <- capturedCount pcre_ptr
  285 
  286     -- The smallest  size  for ovector that will allow for n captured
  287     -- substrings, in addition to the offsets  of  the  substring
  288     -- matched by the whole pattern, is (n+1)*3. (man pcreapi)
  289 
  290     let ovec_size = (n_capt + 1) * 3
  291         ovec_bytes = ovec_size * size_of_cint
  292 
  293     allocaBytes ovec_bytes $ \ovec -> do
  294 
  295         let (str_fp, off, len) = S.toForeignPtr subject
  296         withForeignPtr str_fp $ \cstr -> do
  297             r <- c_pcre_exec
  298                          pcre_ptr
  299                          nullPtr
  300                          (cstr `plusPtr` off) -- may contain binary zero bytes.
  301                          (fromIntegral len)
  302                          0
  303                          (combineExecOptions os)
  304                          ovec
  305                          (fromIntegral ovec_size)
  306 
  307             if r < 0 -- errors, or error_no_match
  308                 then return Nothing
  309                 else let loop n o acc =
  310                             if n == r
  311                               then return (Just (reverse acc))
  312                               else do
  313                                     i <- peekElemOff ovec $! o
  314                                     j <- peekElemOff ovec (o+1)
  315                                     let s = substring i j subject
  316                                     s `seq` loop (n+1) (o+2) (s : acc)
  317                      in loop 0 0 []
  318 
  319     -- The  first  two-thirds  of ovec is used to pass back captured
  320     -- substrings When  a  match  is  successful, information about captured
  321     -- substrings is returned in pairs of integers,  starting  at the
  322     -- beginning of ovector, and continuing up to two-thirds of its length at
  323     -- the most.  The first pair, ovector[0] and ovector[1], identify the
  324     -- portion of the subject string matched  by  the entire pattern.  The next
  325     -- pair is used for the first capturing subpattern,  and  so on.  The
  326     -- value returned  by pcre_exec() is one more than the highest num- bered
  327     -- pair that has been set. For  example,  if  two  sub- strings  have been
  328     -- captured, the returned value is 3. 
  329 
  330   where
  331     -- The first element of a pair is set  to  the offset of the first
  332     -- character in a substring, and the second is set to the offset of the
  333     -- first character after  the  end of a substring.
  334     substring :: CInt -> CInt -> S.ByteString -> S.ByteString
  335     substring x y _ | x == y = S.empty -- XXX an unset subpattern
  336     substring a b s = end -- note that we're not checking...
  337         where
  338             start = S.unsafeDrop (fromIntegral a) s
  339             end   = S.unsafeTake (fromIntegral (b-a)) start
  340 
  341     -- use pcre_info to work out how many substrings to reserve space for
  342     capturedCount :: Ptr PCRE -> IO Int
  343     capturedCount regex_ptr =
  344         alloca $ \n_ptr -> do -- (st :: Ptr CInt)
  345              c_pcre_fullinfo regex_ptr nullPtr info_capturecount n_ptr
  346              return . fromIntegral =<< peek (n_ptr :: Ptr CInt)