This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.
enum URegexpFlag
* The contents of the pattern UText will be extracted and saved. Ownership of the * UText struct itself remains with the caller. This is to match the behavior of * uregex_open(). * * @param pattern The Regular Expression pattern to be compiled. * @param flags Flags that alter the default matching behavior for * the regular expression, UREGEX_CASE_INSENSITIVE, for * example. For default behavior, set this parameter to zero. * See enum URegexpFlag. All desired flags * are bitwise-ORed together. * @param pe Receives the position (line and column numbers) of any syntax * error within the source regular expression string. If this * information is not wanted, pass NULL for this parameter. * @param status Receives error detected by this function. * * @stable ICU 4.6 */ U_STABLE URegularExpression * U_EXPORT2 uregex_openUText(UText *pattern, uint32_t flags, UParseError *pe, UErrorCode *status); #if !UCONFIG_NO_CONVERSION /** * Open (compile) an ICU regular expression. The resulting regular expression * handle can then be used to perform various matching operations. *
* This function is the same as uregex_open, except that the pattern * is supplied as an 8 bit char * string in the default code page. * * @param pattern The Regular Expression pattern to be compiled, * NUL terminated. * @param flags Flags that alter the default matching behavior for * the regular expression, UREGEX_CASE_INSENSITIVE, for * example. For default behavior, set this parameter to zero. * See enum URegexpFlag. All desired flags * are bitwise-ORed together. * @param pe Receives the position (line and column numbers) of any syntax * error within the source regular expression string. If this * information is not wanted, pass NULL for this parameter. * @param status Receives errors detected by this function. * @return The URegularExpression object representing the compiled * pattern. * * @stable ICU 3.0 */ U_STABLE URegularExpression * U_EXPORT2 uregex_openC( const char *pattern, uint32_t flags, UParseError *pe, UErrorCode *status); #endif /** * Close the regular expression, recovering all resources (memory) it * was holding. * * @param regexp The regular expression to be closed. * @stable ICU 3.0 */ U_STABLE void U_EXPORT2 uregex_close(URegularExpression *regexp); #if U_SHOW_CPLUSPLUS_API U_NAMESPACE_BEGIN /** * \class LocalURegularExpressionPointer * "Smart pointer" class, closes a URegularExpression via uregex_close(). * For most methods see the LocalPointerBase base class. * * @see LocalPointerBase * @see LocalPointer * @stable ICU 4.4 */ U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); U_NAMESPACE_END #endif /** * Make a copy of a compiled regular expression. Cloning a regular * expression is faster than opening a second instance from the source * form of the expression, and requires less memory. *
* Note that the current input string and the position of any matched text * within it are not cloned; only the pattern itself and the * match mode flags are copied. *
* Cloning can be particularly useful to threaded applications that perform * multiple match operations in parallel. Each concurrent RE * operation requires its own instance of a URegularExpression. * * @param regexp The compiled regular expression to be cloned. * @param status Receives indication of any errors encountered * @return the cloned copy of the compiled regular expression. * @stable ICU 3.0 */ U_STABLE URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *regexp, UErrorCode *status); /** * Returns a pointer to the source form of the pattern for this regular expression. * This function will work even if the pattern was originally specified as a UText. * * @param regexp The compiled regular expression. * @param patLength This output parameter will be set to the length of the * pattern string. A NULL pointer may be used here if the * pattern length is not needed, as would be the case if * the pattern is known in advance to be a NUL terminated * string. * @param status Receives errors detected by this function. * @return a pointer to the pattern string. The storage for the string is * owned by the regular expression object, and must not be * altered or deleted by the application. The returned string * will remain valid until the regular expression is closed. * @stable ICU 3.0 */ U_STABLE const UChar * U_EXPORT2 uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status); /** * Returns the source text of the pattern for this regular expression. * This function will work even if the pattern was originally specified as a UChar string. * * @param regexp The compiled regular expression. * @param status Receives errors detected by this function. * @return the pattern text. The storage for the text is owned by the regular expression * object, and must not be altered or deleted. 
* * @stable ICU 4.6 */ U_STABLE UText * U_EXPORT2 uregex_patternUText(const URegularExpression *regexp, UErrorCode *status); /** * Get the match mode flags that were specified when compiling this regular expression. * @param regexp The compiled regular expression. * @param status Receives errors detected by this function. * @return The match mode flags, bitwise-ORed together. * @see URegexpFlag * @stable ICU 3.0 */ U_STABLE int32_t U_EXPORT2 uregex_flags(const URegularExpression *regexp, UErrorCode *status); /** * Set the subject text string upon which the regular expression will look for matches. * This function may be called any number of times, allowing the regular * expression pattern to be applied to different strings. *
* Regular expression matching operations work directly on the application's * string data. No copy is made. The subject string data must not be * altered after calling this function until after all regular expression * operations involving this string data are completed. *
* Zero length strings are permitted. In this case, no subsequent match * operation will dereference the text string pointer. * * @param regexp The compiled regular expression. * @param text The subject text string. * @param textLength The length of the subject text, or -1 if the string * is NUL terminated. * @param status Receives errors detected by this function. * @stable ICU 3.0 */ U_STABLE void U_EXPORT2 uregex_setText(URegularExpression *regexp, const UChar *text, int32_t textLength, UErrorCode *status); /** * Set the subject text string upon which the regular expression will look for matches. * This function may be called any number of times, allowing the regular * expression pattern to be applied to different strings. *
* Regular expression matching operations work directly on the application's * string data; only a shallow clone is made. The subject string data must not be * altered after calling this function until after all regular expression * operations involving this string data are completed. * * @param regexp The compiled regular expression. * @param text The subject text string. * @param status Receives errors detected by this function. * * @stable ICU 4.6 */ U_STABLE void U_EXPORT2 uregex_setUText(URegularExpression *regexp, UText *text, UErrorCode *status); /** * Get the subject text that is currently associated with this * regular expression object. If the input was supplied using uregex_setText(), * that pointer will be returned. Otherwise, the characters in the input will * be extracted to a buffer and returned. In either case, ownership remains * with the regular expression object. * * This function will work even if the input was originally specified as a UText. * * @param regexp The compiled regular expression. * @param textLength The length of the string is returned in this output parameter. * A NULL pointer may be used here if the * text length is not needed, as would be the case if * the text is known in advance to be a NUL terminated * string. * @param status Receives errors detected by this function. * @return Pointer to the subject text string currently associated with * this regular expression. * @stable ICU 3.0 */ U_STABLE const UChar * U_EXPORT2 uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status); /** * Get the subject text that is currently associated with this * regular expression object. * * This function will work even if the input was originally specified as a UChar string. * * @param regexp The compiled regular expression. * @param dest A mutable UText in which to store the current input. * If NULL, a new UText will be created as an immutable shallow clone * of the actual input string. 
* @param status Receives errors detected by this function. * @return The subject text currently associated with this regular expression. * If a pre-allocated UText was provided, it will always be used and returned. * * @stable ICU 4.6 */ U_STABLE UText * U_EXPORT2 uregex_getUText(URegularExpression *regexp, UText *dest, UErrorCode *status); /** * Set the subject text string upon which the regular expression is looking for matches * without changing any other aspect of the matching state. * The new and previous text strings must have the same content. * * This function is intended for use in environments where ICU is operating on * strings that may move around in memory. It provides a mechanism for notifying * ICU that the string has been relocated, and providing a new UText to access the * string in its new position. * * Note that the regular expression implementation never copies the underlying text * of a string being matched, but always operates directly on the original text * provided by the user. Refreshing simply drops the references to the old text * and replaces them with references to the new. * * Caution: this function is normally used only by very specialized * system-level code. One example use case is with garbage collection * that moves the text in memory. * * @param regexp The compiled regular expression. * @param text The new (moved) text string. * @param status Receives errors detected by this function. * * @stable ICU 4.8 */ U_STABLE void U_EXPORT2 uregex_refreshUText(URegularExpression *regexp, UText *text, UErrorCode *status); /** * Attempts to match the input string against the pattern. * To succeed, the match must extend to the end of the string, * or cover the complete match region. * * If startIndex >= zero the match operation starts at the specified * index and must extend to the end of the input string. Any region * that has been specified is reset. 
* * If startIndex == -1 the match must cover the input region, or the entire * input string if no region has been set. This directly corresponds to * Matcher.matches() in Java * * @param regexp The compiled regular expression. * @param startIndex The input string (native) index at which to begin matching, or -1 * to match the input Region. * @param status Receives errors detected by this function. * @return TRUE if there is a match * @stable ICU 3.0 */ U_STABLE UBool U_EXPORT2 uregex_matches(URegularExpression *regexp, int32_t startIndex, UErrorCode *status); /** * 64bit version of uregex_matches. * Attempts to match the input string against the pattern. * To succeed, the match must extend to the end of the string, * or cover the complete match region. * * If startIndex >= zero the match operation starts at the specified * index and must extend to the end of the input string. Any region * that has been specified is reset. * * If startIndex == -1 the match must cover the input region, or the entire * input string if no region has been set. This directly corresponds to * Matcher.matches() in Java * * @param regexp The compiled regular expression. * @param startIndex The input string (native) index at which to begin matching, or -1 * to match the input Region. * @param status Receives errors detected by this function. * @return TRUE if there is a match * @stable ICU 4.6 */ U_STABLE UBool U_EXPORT2 uregex_matches64(URegularExpression *regexp, int64_t startIndex, UErrorCode *status); /** * Attempts to match the input string, starting from the specified index, against the pattern. * The match may be of any length, and is not required to extend to the end * of the input string. Contrast with uregex_matches(). * *
If startIndex is >= 0 any input region that was set for this * URegularExpression is reset before the operation begins. * *
If the specified starting index == -1 the match begins at the start of the input * region, or at the start of the full string if no region has been specified. * This corresponds directly with Matcher.lookingAt() in Java. * *
If the match succeeds then more information can be obtained via the * uregex_start(), uregex_end(), * and uregex_group() functions.
uregex_start()
uregex_end()
uregex_group()
uregex_start(), uregex_end()
The input string, starting from the end of the previous match and ending at * the start of the current match, is appended to the destination string. Then the * replacement string is appended to the output string, * including handling any substitutions of captured text.
A note on preflight computation of buffer size and error handling: * Calls to uregex_appendReplacement() and uregex_appendTail() are * designed to be chained, one after another, with the destination * buffer pointer and buffer capacity updated after each in preparation * for the next. If the destination buffer is exhausted partway through such a * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal * ICU conventions are for a function to perform no action if it is * called with an error status, but for this one case, uregex_appendReplacement() * will operate normally so that buffer size computations will complete * correctly. * *
For simple, prepackaged, non-incremental find-and-replace * operations, see uregex_replaceFirst() or uregex_replaceAll().
uregex_appendTail()
uregex_appendReplacement()
uregex_appendTailUText()
uregex_appendReplacementUText()
* The behavior of this function is not very closely aligned with uregex_split(); * instead, it is based on (and implemented directly on top of) the C++ split method. * * @param regexp The compiled regular expression. * @param destFields An array of mutable UText structs to receive the results of the split. * If a field is NULL, a new UText is allocated to contain the results for * that field. This new UText is not guaranteed to be mutable. * @param destFieldsCapacity The number of elements in the destination array. * If the number of fields found is less than destFieldsCapacity, the * extra strings in the destination array are not altered. * If the number of destination strings is less than the number * of fields, the trailing part of the input string, including any * field delimiters, is placed in the last destination string. * This behavior mimics that of Perl. It is not an error condition, and no * error status is returned when all destFields positions are used. * @param status A reference to a UErrorCode to receive any errors. * @return The number of fields into which the input string was split. * * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uregex_splitUText(URegularExpression *regexp, UText *destFields[], int32_t destFieldsCapacity, UErrorCode *status); /** * Set a processing time limit for match operations with this URegularExpression. * * Some patterns, when matching certain strings, can run in exponential time. * For practical purposes, the match operation may appear to be in an * infinite loop. * When a limit is set a match operation will fail with an error if the * limit is exceeded. *
* The units of the limit are steps of the match engine. * Correspondence with actual processor time will depend on the speed * of the processor and the details of the specific pattern, but will * typically be on the order of milliseconds. *
* By default, the matching time is not limited. *
* * @param regexp The compiled regular expression. * @param limit The limit value, or 0 for no limit. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_STABLE void U_EXPORT2 uregex_setTimeLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); /** * Get the time limit for matches with this URegularExpression. * A return value of zero indicates that there is no limit. * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. * @return the maximum allowed time for a match, in units of processing steps. * @stable ICU 4.0 */ U_STABLE int32_t U_EXPORT2 uregex_getTimeLimit(const URegularExpression *regexp, UErrorCode *status); /** * Set the amount of heap storage available for use by the match backtracking stack. *
* ICU uses a backtracking regular expression engine, with the backtrack stack * maintained on the heap. This function sets the limit to the amount of memory * that can be used for this purpose. A backtracking stack overflow will * result in an error from the match operation that caused it. *
* A limit is desirable because a malicious or poorly designed pattern can use * excessive memory, potentially crashing the process. A limit is enabled * by default. *
* @param regexp The compiled regular expression. * @param limit The maximum size, in bytes, of the matching backtrack stack. * A value of zero means no limit. * The limit must be greater than or equal to zero. * @param status A reference to a UErrorCode to receive any errors. * * @stable ICU 4.0 */ U_STABLE void U_EXPORT2 uregex_setStackLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); /** * Get the size of the heap storage available for use by the backtracking stack. * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. * @return the maximum backtracking stack size, in bytes, or zero if the * stack size is unlimited. * @stable ICU 4.0 */ U_STABLE int32_t U_EXPORT2 uregex_getStackLimit(const URegularExpression *regexp, UErrorCode *status); /** * Function pointer for a regular expression matching callback function. * When set, a callback function will be called periodically during matching * operations. If the callback function returns FALSE, the matching * operation will be terminated early. * * Note: the callback function must not call other functions on this * URegularExpression. * * @param context context pointer. The callback function will be invoked * with the context specified at the time that * uregex_setMatchCallback() is called. * @param steps the accumulated processing time, in match steps, * for this matching operation. * @return TRUE to continue the matching operation. * FALSE to terminate the matching operation. * @stable ICU 4.0 */ U_CDECL_BEGIN typedef UBool U_CALLCONV URegexMatchCallback ( const void *context, int32_t steps); U_CDECL_END /** * Set a callback function for this URegularExpression. * During matching operations the function will be called periodically, * giving the application the opportunity to terminate a long-running * match. * * @param regexp The compiled regular expression. * @param callback A pointer to the user-supplied callback function. * @param context User context pointer. 
The value supplied at the * time the callback function is set will be saved * and passed to the callback each time that it is called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_STABLE void U_EXPORT2 uregex_setMatchCallback(URegularExpression *regexp, URegexMatchCallback *callback, const void *context, UErrorCode *status); /** * Get the callback function for this URegularExpression. * * @param regexp The compiled regular expression. * @param callback Out parameter, receives a pointer to the user-supplied * callback function. * @param context Out parameter, receives the user context pointer that * was set when uregex_setMatchCallback() was called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_STABLE void U_EXPORT2 uregex_getMatchCallback(const URegularExpression *regexp, URegexMatchCallback **callback, const void **context, UErrorCode *status); /** * Function pointer for a regular expression find callback function. * * When set, a callback function will be called during a find operation * and for operations that depend on find, such as findNext, split and some replace * operations like replaceFirst. * The callback will usually be called after each attempt at a match, but this is not a * guarantee that the callback will be invoked at each character. For finds where the * match engine is invoked at each character, this may be close to true, but less likely * for more optimized loops where the pattern is known to only start, and the match * engine invoked, at certain characters. * When invoked, this callback will specify the index at which a match operation is about * to be attempted, giving the application the opportunity to terminate a long-running * find operation. * * If the callback function returns FALSE, the find operation will be terminated early. * * Note: the callback function must not call other functions on this * URegularExpression. * * @param context context pointer. 
The callback function will be invoked * with the context specified at the time that * uregex_setFindProgressCallback() is called. * @param matchIndex the next index at which a match will be attempted for this * find operation. If this callback interrupts the search, this is the * index at which a find/findNext operation may be re-initiated. * @return TRUE to continue the matching operation. * FALSE to terminate the matching operation. * @stable ICU 4.6 */ U_CDECL_BEGIN typedef UBool U_CALLCONV URegexFindProgressCallback ( const void *context, int64_t matchIndex); U_CDECL_END /** * Set the find progress callback function for this URegularExpression. * * @param regexp The compiled regular expression. * @param callback A pointer to the user-supplied callback function. * @param context User context pointer. The value supplied at the * time the callback function is set will be saved * and passed to the callback each time that it is called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.6 */ U_STABLE void U_EXPORT2 uregex_setFindProgressCallback(URegularExpression *regexp, URegexFindProgressCallback *callback, const void *context, UErrorCode *status); /** * Get the find progress callback function for this URegularExpression. * * @param regexp The compiled regular expression. * @param callback Out parameter, receives a pointer to the user-supplied * callback function. * @param context Out parameter, receives the user context pointer that * was set when uregex_setFindProgressCallback() was called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.6 */ U_STABLE void U_EXPORT2 uregex_getFindProgressCallback(const URegularExpression *regexp, URegexFindProgressCallback **callback, const void **context, UErrorCode *status); #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ #endif /* UREGEX_H */