// 前端之家收集整理的这篇文章主要介绍了使用 ICU 库中正则表达式匹配关键字的示例,
// 前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
#include <string.h>

#include <iostream>
#include <vector>

#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
#include <unicode/unistr.h>
// Auto-link the ICU import libraries via `#pragma comment(lib, ...)`.
// Nothing is requested on Linux. Elsewhere the library names are selected by
// build flavour (_DEBUG picks the "d"-suffixed libraries) and by target width
// (_WIN64 picks the 64-bit names, otherwise the 32-bit names are used).
// Both `linux` and `__linux__` are tested because `linux` is not predefined
// when the compiler runs in a strict ISO mode.
#if !defined(linux) && !defined(__linux__)
#ifdef _DEBUG
#ifdef _WIN64
#pragma comment(lib,"icuin64d.lib")
#pragma comment(lib,"icuuc64d.lib")
#else
#pragma comment(lib,"icuin32d.lib")
#pragma comment(lib,"icuuc32d.lib")
#endif // _WIN64
#else
#ifdef _WIN64
#pragma comment(lib,"icuin64.lib")
#pragma comment(lib,"icuuc64.lib")
#else
#pragma comment(lib,"icuin32.lib")
#pragma comment(lib,"icuuc32.lib")
#endif // _WIN64
#endif // _DEBUG
#endif // not Linux

// Chunk size used by FindSubFromBuf when it converts its input piecewise.
int32_t BUFFSIZE = 8;
/// Counts how many times USubStr is found in USrcStr.
///
/// After each hit the search resumes `index` code units past the start of
/// that hit, so passing USubStr.length() counts non-overlapping occurrences
/// and passing 1 counts overlapping ones as well.
///
/// @param USrcStr text to search in
/// @param USubStr text to search for
/// @param index   distance to advance after each hit; values below 1 are
///                treated as 1 so the search always moves forward
/// @return number of hits found
int FindSubNum(UnicodeString USrcStr,UnicodeString USubStr,int index)
{
    // A step of zero or less would re-find the same hit forever.
    const int32_t step = (index > 0) ? index : 1;
    const int32_t srcLen = USrcStr.length();

    int32_t num = 0;
    int32_t pos = USrcStr.indexOf(USubStr);
    while (pos != -1)
    {
        ++num;
        // The next start would lie at or past the end of the text: nothing
        // more can be found, and stopping here also keeps pos + step from
        // overflowing for very large steps.
        if (step >= srcLen - pos)
        {
            break;
        }
        pos = USrcStr.indexOf(USubStr, pos + step);
    }
    return num;
}
int FindSubFromBuf(char* buff,int buflen,UnicodeString liststring)
{
if((buff == NULL) || liststring.isEmpty() || buflen == 0)
{
return 0;
}
UCharsetDetector* dector = NULL;
UErrorCode status = U_ZERO_ERROR;
UConverter *conv = NULL;
const UCharsetMatch *csm = NULL;
int num = 0;
//先检测buff里的字符编码格式
dector = ucsdet_open(&status);
if(status != U_ZERO_ERROR)
{
//std::cout<<"open charset detector Failed!\n";
ucsdet_close(dector);
return 0;
}
ucsdet_setText(dector,buff,buflen,&status);
if(status != U_ZERO_ERROR)
{
//std::cout<<"set fail!\n";
ucsdet_close(dector);
return 0;
}
csm = ucsdet_detect(dector,&status);
const char* detected = ucsdet_getName(csm,&status);
ucsdet_close(dector);
//然后转化为Unicode编码进行比较
UChar *target = NULL; //指向存储转换后的字符串的结尾
UChar *targetLimit = NULL; //指向缓冲区尾部的指针
const char *source = NULL; //指向源代码页缓冲区
const char *sourceLimit = NULL; //指向缓冲区的尾部的字节
int32_t *offset = NULL; //表示什么也不做*/
int32_t numread = 0; //实际读了多少字节
int32_t buffsize = 0;
conv = ucnv_open(detected,&status);
if(status != U_ZERO_ERROR)
{
std::cout<<"open converter Failed!\n";
ucnv_close(conv);
return 0;
}
buffsize = BUFFSIZE/ucnv_getMinCharSize(conv);
char* read = buff;
char* inbuf = new char[BUFFSIZE*sizeof(char) +1];
UChar* uBuf = new UChar[BUFFSIZE*sizeof(UChar) + 2];
memset(inbuf,BUFFSIZE*sizeof(char) + 1);
memset(uBuf,BUFFSIZE*sizeof(UChar) + 2);
UnicodeString readbuff= UnicodeString("");
UnicodeString tempUStr = UnicodeString("");
while((read-buff)<buflen)
{
memcpy(inbuf,read,BUFFSIZE);
int fin_len = buflen-(read-buff);
if (fin_len > BUFFSIZE)
{
fin_len = BUFFSIZE;
}
read = read + fin_len;
numread = strlen(inbuf);
/* UnicodeString readbuff;*/
source = inbuf;
sourceLimit = inbuf + numread;
do
{
target = uBuf;
targetLimit = uBuf + buffsize;//分食
ucnv_toUnicode(conv,&target,targetLimit,&source,sourceLimit,NULL,(read-buff == buflen)?true:false,&status);
if(status != U_ZERO_ERROR)
{
//std::cout<<"Convert fail!\n";
if (uBuf)
{
delete [] uBuf;
uBuf = NULL;
}
if (inbuf)
{
delete []inbuf;
inbuf = NULL;
}
ucnv_close(conv);
return 0;
}
}
while(source < sourceLimit);
//用uBuf初始化UnicString对象
readbuff = UnicodeString(uBuf);
int32_t readbuflen = readbuff.length();
//第一次先给midUStr赋空,防止在进行
int32_t plen = readbuff.length();
UnicodeString temp = tempUStr + readbuff;
int len = liststring.length();
num += FindSubNum(temp,liststring,len);
//每保存一段内存块的回退字节就清空一次便于下一次继续存放
//保留本次的后 len-1个字符
tempUStr = temp.tempSubString(temp.length() - (len-1),len);
memset(inbuf,BUFFSIZE+1);
memset(uBuf,BUFFSIZE*sizeof(UChar)+2);
}
if (uBuf)
{
delete [] uBuf;
uBuf = NULL;
}
if (inbuf)
{
delete []inbuf;
inbuf = NULL;
}
ucnv_close(conv);
return num;
}
// 原文链接:https://www.f2er.com/regex/362086.html