1 --------------------------------------------------------------------
    2 -- |
    3 -- Module   : Text.Regex.PCRE.Light.Char8
    4 -- Copyright: Copyright (c) 2007-2008, Don Stewart
    5 -- License  : BSD3
    6 --
    7 -- Maintainer:  Don Stewart <dons@galois.com>
    8 -- Stability :  experimental
    9 -- Portability: H98 + FFI
   10 --
   11 --------------------------------------------------------------------
   12 -- 
   13 -- A simple, portable binding to perl-compatible regular expressions
   14 -- (PCRE) via 8-bit latin1 Strings.
   15 --
   16 
   17 module Text.Regex.PCRE.Light.Char8 (
   18 
   19         -- * The abstract PCRE Regex type
   20           Regex
   21 
   22         -- * String interface
   23         , compile, compileM
   24         , match
   25 
   26         -- * Regex types and constructors externally visible
   27 
   28         -- ** PCRE compile-time bit flags
   29         , PCREOption
   30 
   31         , anchored
   32         , auto_callout
   33         {-, bsr_anycrlf-}
   34         {-, bsr_unicode-}
   35         , caseless
   36         , dollar_endonly
   37         , dotall
   38         , dupnames
   39         , extended
   40         , extra
   41         , firstline
   42         , multiline
   43         {-, newline_any-}
   44         {-, newline_anycrlf-}
   45         , newline_cr
   46         , newline_crlf
   47         , newline_lf
   48         , no_auto_capture
   49         , ungreedy
   50         , utf8
   51         , no_utf8_check
   52 
   53         -- ** PCRE exec-time bit flags
   54         , PCREExecOption
   55 
   56         , exec_anchored
   57         {-, exec_newline_any     -}
   58         {-, exec_newline_anycrlf -}
   59         , exec_newline_cr
   60         , exec_newline_crlf
   61         , exec_newline_lf
   62         , exec_notbol
   63         , exec_noteol
   64         , exec_notempty
   65         , exec_no_utf8_check
   66         , exec_partial
   67 
   68     ) where
   69 
   70 import qualified Data.ByteString.Char8 as S
   71 import qualified Text.Regex.PCRE.Light as S
   72 import Text.Regex.PCRE.Light hiding (match, compile, compileM)
   73 
   74 -- | 'compile'
   75 --
-- Compile a perl-compatible regular expression, given as a 'String'.
-- The arguments are:
--
-- * 'pat': A 'String' containing the regular expression to be compiled.
-- It is packed into a strict ByteString before being compiled.
   81 --
-- * 'flags', a list of compile-time bit flags. If the list is empty,
-- defaults are used.
   83 --
   84 -- Valid compile-time flags are:
   85 --
   86 -- * 'anchored'        - Force pattern anchoring
   87 --
   88 -- * 'auto_callout'    - Compile automatic callouts
   89 --
   90 -- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
   91 --
   92 -- * 'bsr_unicode'     - \\R matches all Unicode line endings
   93 --
   94 -- * 'caseless'        - Do caseless matching
   95 --
   96 -- * 'dollar_endonly'  - '$' not to match newline at end
   97 --
   98 -- * 'dotall'          - matches anything including NL
   99 --
  100 -- * 'dupnames'        - Allow duplicate names for subpatterns
  101 --
  102 -- * 'extended'        - Ignore whitespace and # comments
  103 --
  104 -- * 'extra'           - PCRE extra features (not much use currently)
  105 --
  106 -- * 'firstline'       - Force matching to be  before  newline
  107 --
  108 -- * 'multiline'       - '^' and '$' match newlines within data
  109 --
  110 -- * 'newline_any'     - Recognize any Unicode newline sequence
  111 --
  112 -- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
  113 --
  114 -- * 'newline_cr'      - Set CR as the newline sequence
  115 --
  116 -- * 'newline_crlf'    - Set CRLF as the newline sequence
  117 --
  118 -- * 'newline_lf'      - Set LF as the newline sequence
  119 --
  120 -- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
  121 --
  122 -- * 'ungreedy'        - Invert greediness of quantifiers
  123 --
  124 -- * 'utf8'            - Run in UTF-8 mode
  125 --
  126 -- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
  127 --
-- On success, an abstract type representing the compiled regular
-- expression is returned. The result is a plain 'Regex' rather than an
-- 'Either', so a failed compilation cannot be reported through the return
-- value; use 'compileM' to have the failure lifted into a monad.
  131 -- The regex is allocated via malloc on the C side, and will be
  132 -- deallocated by the runtime when the Haskell value representing it
  133 -- goes out of scope.
  134 --
  135 -- As regexes are often defined statically, GHC will compile them 
  136 -- to null-terminated, strict C strings, enabling compilation of the 
  137 -- pattern without copying. This may be useful for very large patterns.
  138 --
  139 -- See man pcreapi for more details.
  140 --
  141 compile :: String -> [PCREOption] -> Regex
  142 -- entered 535 timescompile str os = S.compile (S.pack str) os
  143 {-# INLINE compile #-}
  144 
  145 -- | 'compileM'
  146 -- A generic version of 'compile' with failure lifted into an arbitrary monad.
  147 compileM :: Monad m => String -> [PCREOption] -> m Regex
  148 -- entered 561 timescompileM str os = S.compileM (S.pack str) os
  149 {-# INLINE compileM #-}
  150 
  151 
  152 -- | 'match'
  153 --
  154 -- Matches a compiled regular expression against a given subject string,
  155 -- using a matching algorithm that is similar to Perl's. If the subject
  156 -- string doesn't match the regular expression, 'Nothing' is returned,
  157 -- otherwise the portion of the string that matched is returned, along
  158 -- with any captured subpatterns.
  159 --
  160 -- The arguments are:
  161 --
  162 -- * 'regex', a PCRE regular expression value produced by compile
  163 --
  164 -- * 'subject', the subject string to match against
  165 --
  166 -- * 'options', an optional set of exec-time flags to exec.
  167 --
-- Available runtime options are:
--
-- * 'exec_anchored'        - Match only at the first position
--
-- * 'bsr_anycrlf'          - '\\R' matches only CR, LF, or CRLF
--   (not currently exported by this module)
--
-- * 'bsr_unicode'          - '\\R' matches all Unicode line endings
--   (not currently exported by this module)
--
-- * 'exec_newline_any'     - Recognize any Unicode newline sequence
--   (not currently exported by this module)
--
-- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--   (not currently exported by this module)
--
-- * 'exec_newline_cr'      - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'      - Set LF as the newline sequence
--
-- * 'exec_notbol'          - Subject is not the beginning of a line
--
-- * 'exec_noteol'          - Subject is not the end of a line
--
-- * 'exec_notempty'        - An empty string is not a valid match
--
-- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
--
-- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
--
  195 --
  196 -- The result value, and any captured subpatterns, are returned.
  197 -- If the regex is invalid, or the subject string is empty, Nothing
  198 -- is returned.
  199 --
  200 match :: Regex -> String -> [PCREExecOption] -> Maybe [String]
  201 -- entered 2342 timesmatch r subject os =
  202     case S.match r (S.pack subject) os of
  203            Nothing -> Nothing
  204            Just x  -> Just (map S.unpack x)
  205 {-# INLINE match #-}