* Character set detection is at best an imprecise operation. The detection * process will attempt to identify the charset that best matches the characteristics * of the byte data, but the process is partly statistical in nature, and * the results cannot be guaranteed to always be correct. *
* For best accuracy in charset detection, the input data should be primarily * in a single language, and a minimum of a few hundred bytes worth of plain text * in the language are needed. The detection process will attempt to * ignore HTML- or XML-style markup that could otherwise obscure the content. *
* An alternative to the ICU Charset Detector is the * Compact Encoding Detector, https://github.com/google/compact_enc_det. * It often gives more accurate results, especially with short input samples. */ struct UCharsetDetector; /** * Structure representing a charset detector. It is only forward-declared * here and is handled through pointers by the functions below. * @stable ICU 3.6 */ typedef struct UCharsetDetector UCharsetDetector; struct UCharsetMatch; /** * Opaque structure representing a match that was identified * from a charset detection operation. * @stable ICU 3.6 */ typedef struct UCharsetMatch UCharsetMatch; /** * Open a charset detector. * * @param status Any error conditions occurring during the open * operation are reported back in this variable. * @return the newly opened charset detector. Release it with * ucsdet_close() when it is no longer needed. * @stable ICU 3.6 */ U_STABLE UCharsetDetector * U_EXPORT2 ucsdet_open(UErrorCode *status); /** * Close a charset detector. All storage and any other resources * owned by this charset detector will be released. Failure to * close a charset detector when finished with it can result in * memory leaks in the application. * * @param ucsd The charset detector to be closed. * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ucsdet_close(UCharsetDetector *ucsd); #if U_SHOW_CPLUSPLUS_API U_NAMESPACE_BEGIN /** * \class LocalUCharsetDetectorPointer * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). * For most methods see the LocalPointerBase base class. * * @see LocalPointerBase * @see LocalPointer * @stable ICU 4.4 */ U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); U_NAMESPACE_END #endif /** * Set the input byte data whose charset is to be detected. * * Ownership of the input text byte array remains with the caller. * The input string must not be altered or deleted until the charset * detector is either closed or reset to refer to different input text. * * @param ucsd the charset detector to be used. * @param textIn the input text of unknown encoding.
* @param len the length of the input text, or -1 if the text * is NUL terminated. * @param status any error conditions are reported back in this variable. * * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); /** Set the declared encoding for charset detection. * The declared encoding of an input text is an encoding obtained * by the user from an HTTP header or XML declaration or similar source that * can be provided as an additional hint to the charset detector. * * How and whether the declared encoding will be used during the * detection process is yet to be determined. * * @param ucsd the charset detector to be used. * @param encoding an encoding for the current data obtained from * a header or declaration or other source outside * of the byte data itself. * @param length the length of the encoding name, or -1 if the name string * is NUL terminated. * @param status any error conditions are reported back in this variable. * * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); /** * Return the charset that best matches the supplied input data. * * Note though, that because the detection * only looks at the start of the input data, * there is a possibility that the returned charset will fail to handle * the full set of input data. *
* The returned UCharsetMatch object is owned by the UCharsetDetector. * It will remain valid until the detector input is reset, or until * the detector is closed. *
* The function will fail if *
* The returned UCharsetMatch objects are owned by the UCharsetDetector. * They will remain valid until the detector is closed or modified. * *
* Return an error if *
* The state of the Charset detector that is passed in does not * affect the result of this function, but requiring a valid, open * charset detector as a parameter ensures that the charset detection * service has been safely initialized and that the required detection * data is available. * *
* Note: Multiple different charset encodings in a same family may use * a single shared name in this implementation. For example, this method returns * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" * (Windows Latin 1). However, actual detection result could be "windows-1252" * when the input data matches Latin 1 code points with any points only available * in "windows-1252". * * @param ucsd a Charset detector. * @param status Any error conditions are reported back in this variable. * @return an iterator providing access to the detectable charset names. * @stable ICU 3.6 */ U_STABLE UEnumeration * U_EXPORT2 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); /** * Test whether input filtering is enabled for this charset detector. * Input filtering removes text that appears to be HTML or XML * markup from the input before applying the code page detection * heuristics. * * @param ucsd The charset detector to check. * @return TRUE if filtering is enabled. * @stable ICU 3.6 */ U_STABLE UBool U_EXPORT2 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); /** * Enable or disable filtering of input text. If filtering is enabled, * text within angle brackets ("<" and ">") will be removed * before detection, which will remove most HTML or XML markup. * * @param ucsd the charset detector to be modified. * @param filter TRUE to enable input text filtering. * @return The previous setting. * * @stable ICU 3.6 */ U_STABLE UBool U_EXPORT2 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); #ifndef U_HIDE_INTERNAL_API /** * Get an iterator over the set of detectable charsets - * over the charsets that are enabled by the specified charset detector. * * The returned UEnumeration provides access to the names of * the charsets. * * @param ucsd a Charset detector. * @param status Any error conditions are reported back in this variable. 
* @return an iterator providing access to the detectable charset names by * the specified charset detector. * @internal */ U_INTERNAL UEnumeration * U_EXPORT2 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); /** * Enable or disable an individual charset encoding. * The name of the charset encoding must be one of the names returned by * ucsdet_getAllDetectableCharsets(). * * @param ucsd a Charset detector. * @param encoding the name of the charset encoding. * @param enabled TRUE to enable, or FALSE to disable the * charset encoding. * @param status receives the return status. When the name of charset encoding * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. * @internal */ U_INTERNAL void U_EXPORT2 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); #endif /* U_HIDE_INTERNAL_API */ #endif #endif /* __UCSDET_H */
true
TRUE
FALSE