最常见的词语二分法:
// Sample input mixing Chinese text, Latin letters and digits.
$str = '这是我的网站www.7di.net!';
// Uncomment when the input is GB2312-encoded rather than UTF-8.
//$str = iconv('GB2312','UTF-8',$str);
// This call must sit on its own line: when it shares a line with the
// comment above it is commented out too and $result is never defined.
$result = spStr($str);
print_r($result);
/**
 * UTF-8 Chinese bigram (two-character) word segmentation.
 *
 * Punctuation is replaced by spaces, runs of ASCII digits and ASCII letters
 * are collected as whole tokens, and every remaining segment is split into
 * overlapping pairs of adjacent characters. A segment that consists of a
 * single character is emitted as that one character.
 *
 * NOTE(review): the pairing walks each segment in 3-byte steps, i.e. it
 * assumes every character left in a segment is a 3-byte UTF-8 sequence.
 * ASCII symbols missing from the separator list, or 2-/4-byte characters,
 * would break that alignment.
 *
 * @param string $str UTF-8 encoded input text.
 * @return array Digit runs first, then letter runs, then the bigrams.
 */
function spStr($str)
{
    // NOTE(review): the published listing was cut off right after the "'"
    // entry. Everything from '<' onward is a reconstruction of the usual
    // ASCII and full-width separators -- confirm against the original list.
    $search = array(
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')', '?',
        '-', "\t", "\n", "'", '<', '>', "\r", '$', '&', '%', '#', '@', '+',
        '=', '{', '}', '[', ']',
        '。', ',', '!', ';', ':', '?', '“', '”', '‘', '’', '(', ')',
        '、', '—', '《', '》', '…', '【', '】',
        "\xE3\x80\x80", // U+3000 ideographic (full-width) space
    );
    $str = str_replace($search, ' ', $str);

    // Collect ASCII words and numbers before they are stripped out below.
    preg_match_all('/[a-zA-Z]+/', $str, $estr);
    preg_match_all('/[0-9]+/', $str, $nstr);

    // Only the non-ASCII segments remain, separated by single spaces.
    $str = preg_replace('/[0-9a-zA-Z]+/', ' ', $str);
    $str = preg_replace('/\s{2,}/', ' ', $str);

    $cstr = array();
    foreach (explode(' ', trim($str)) as $s) {
        $l = strlen($s);
        for ($i = 0; $i < $l; $i += 3) {
            // substr() never reads past the end of the string, unlike the
            // per-byte offsets it replaces ($s{$i} was removed in PHP 8).
            $ns1 = substr($s, $i, 3);
            if (isset($s[$i + 3])) {
                $ns2 = substr($s, $i + 3, 3);
                // Pair up only when the follower is a full multi-byte char.
                if (preg_match('/[\x80-\xff]{3}/', $ns2)) {
                    $cstr[] = $ns1 . $ns2;
                }
            } elseif ($i === 0) {
                // Single-character segment: keep the character on its own.
                $cstr[] = $ns1;
            }
        }
    }

    $estr = isset($estr[0]) ? $estr[0] : array();
    $nstr = isset($nstr[0]) ? $nstr[0] : array();

    return array_merge($nstr, $estr, $cstr);
}
执行结果是:
Array ( [0] => 7 [1] => www [2] => di [3] => net [4] => 这是 [5] => 是我 [6] => 我的 [7] => 的网 [8] => 网站 )
接下来,将以上结果转换为区位码,PHP代码是:
2088shop商城购物系统是商城系统中功能最全的一个版本:非会员购物、商品无限级分类、不限商品数量、商品多级会员定价、上货库存、Word在线编辑器、订单详情销售报表、商品评论、留言簿、管理员多级别、VIP积分、会员注册积分奖励、智能新闻发布、滚动公告、投票调查、背景图片颜色更换、店标上传、版权联系方式修改、背景音乐(好歌不断)、广告图片支持Flash、弹出浮动广告、搜索引擎关健词优化、图文友情联
// Convert every token to GB2312 and then to its qu-wei code, joined by spaces.
// Start from an explicit empty list so implode() always receives an array,
// even when $result is empty.
$code = array();
foreach ($result as $s) {
    $gb = iconv('UTF-8','GB2312',$s);
    if ($gb === false) {
        // iconv() returns false when the token cannot be represented in
        // GB2312; such a token has no qu-wei code, so it is skipped.
        continue;
    }
    $code[] = gbCode($gb);
}
$code = implode(" ", $code);
echo $code;
/**
 * Convert a GB2312-encoded string into its qu-wei (section/position) code.
 *
 * Every two-byte character becomes four decimal digits: each byte minus 160
 * (0xA0), zero-padded to two digits. A string that is not made up entirely
 * of high bytes (0x80-0xFF, at least two of them) is returned unchanged, so
 * ASCII words and numbers pass straight through.
 *
 * @param string $str GB2312-encoded text, or a plain ASCII token.
 * @return string The concatenated qu-wei digits, or $str itself when it is
 *                not a run of high bytes.
 */
function gbCode($str)
{
    if (!preg_match('/^[\x80-\xff]{2,}$/', $str)) {
        return $str;
    }

    $return = '';
    $len = strlen($str);
    // Walk the string one byte pair at a time. Stopping at $i + 1 < $len
    // avoids reading past the end if a dangling odd byte is present.
    for ($i = 0; $i + 1 < $len; $i += 2) {
        $return .= sprintf("%02d%02d", ord($str[$i]) - 160, ord($str[$i + 1]) - 160);
    }

    return $return;
}










