--------------------------------------------------------------------
-- |
-- Module     : Text.Regex.PCRE.Light.Char8
-- Copyright  : Copyright (c) 2007-2008, Don Stewart
-- License    : BSD3
--
-- Maintainer : Don Stewart <dons@galois.com>
-- Stability  : experimental
-- Portability: H98 + FFI
--
--------------------------------------------------------------------
--
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via 8-bit latin1 Strings.
--

module Text.Regex.PCRE.Light.Char8 (

        -- * The abstract PCRE Regex type
          Regex

        -- * String interface
        , compile, compileM
        , match

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import qualified Data.ByteString.Char8 as S
import qualified Text.Regex.PCRE.Light as S
import Text.Regex.PCRE.Light hiding (match, compile, compileM)

-- | 'compile'
--
-- Compile a perl-compatible regular expression, given as a 'String'.
-- The arguments are:
--
-- * 'pat': A 'String' containing the regular expression to be compiled.
--
-- * 'flags', a list of optional bit flags. Pass @[]@ when no flags are wanted.
--
-- Valid compile-time flags are:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * @bsr_anycrlf@     - \\R matches only CR, LF, or CRLF (not currently exported by this module)
--
-- * @bsr_unicode@     - \\R matches all Unicode line endings (not currently exported by this module)
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - '$' not to match newline at end
--
-- * 'dotall'          - @.@ matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be before newline
--
-- * 'multiline'       - '^' and '$' match newlines within data
--
-- * @newline_any@     - Recognize any Unicode newline sequence (not currently exported by this module)
--
-- * @newline_anycrlf@ - Recognize CR, LF, and CRLF as newline sequences (not currently exported by this module)
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- This function returns a bare 'Regex', so a failed compilation cannot
-- be signalled through a 'Left' value here; use 'compileM' when the
-- failure has to be observed in a monad.
-- NOTE(review): presumably the underlying binding raises an error on an
-- invalid pattern - confirm against "Text.Regex.PCRE.Light".
-- On success an abstract type representing the compiled regular
-- expression is returned.
-- The regex is allocated via malloc on the C side, and will be
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- As regexes are often defined statically, GHC will compile them
-- to null-terminated, strict C strings, enabling compilation of the
-- pattern without copying.
-- This may be useful for very large patterns.
--
-- See man pcreapi for more details.
--
-- This 'String' front end packs the pattern into a strict Char8
-- ByteString and delegates to the ByteString-based 'compile' of
-- "Text.Regex.PCRE.Light", passing the flags through unchanged.
--
compile :: String -> [PCREOption] -> Regex
compile pat flags = S.compile (S.pack pat) flags
{-# INLINE compile #-}

-- | 'compileM'
-- A generic version of 'compile' with failure lifted into an arbitrary monad.
--
-- The 'String' pattern is packed into a strict Char8 ByteString and
-- handed, together with the flags, to the ByteString-based 'compileM'
-- of "Text.Regex.PCRE.Light"; its result is returned as is.
compileM :: Monad m => String -> [PCREOption] -> m Regex
compileM pat flags = S.compileM (S.pack pat) flags
{-# INLINE compileM #-}


-- | 'match'
--
-- Matches a compiled regular expression against a given subject string,
-- using a matching algorithm that is similar to Perl's. If the subject
-- string doesn't match the regular expression, 'Nothing' is returned,
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- The arguments are:
--
-- * 'regex', a PCRE regular expression value produced by compile
--
-- * 'subject', the subject string to match against
--
-- * 'options', an optional set of exec-time flags to exec.
--
-- Available runtime options are:
--
-- * 'exec_anchored'        - Match only at the first position
--
-- * @bsr_anycrlf@          - '\\R' matches only CR, LF, or CRLF (not exported by this module)
--
-- * @bsr_unicode@          - '\\R' matches all Unicode line endings (not exported by this module)
--
-- * @exec_newline_any@     - Recognize any Unicode newline sequence (not exported by this module)
--
-- * @exec_newline_anycrlf@ - Recognize CR, LF, and CRLF as newline sequences (not exported by this module)
--
-- * 'exec_newline_cr'      - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'      - Set LF as the newline sequence
--
-- * 'exec_notbol'          - Subject is not the beginning of a line
--
-- * 'exec_noteol'          - Subject is not the end of a line
--
-- * 'exec_notempty'        - An empty string is not a valid match
--
-- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
--
-- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The result value, and any captured subpatterns, are returned.
-- If the regex is invalid, or the subject string is empty, Nothing
-- is returned.
--
-- The subject is packed into a strict Char8 ByteString, matched with
-- the ByteString-based 'match' of "Text.Regex.PCRE.Light", and every
-- ByteString in a successful result is unpacked back into a 'String'.
-- A 'Nothing' from the underlying matcher is passed through untouched.
--
match :: Regex -> String -> [PCREExecOption] -> Maybe [String]
match regex subject options =
    fmap (map S.unpack) (S.match regex (S.pack subject) options)
{-# INLINE match #-}