利用余弦值计算字串的相似性
之前用字符串匹配的方式做过一次。这次发现竟然可以用余弦值来计算,感觉很厉害,不过准确度如何还不清楚。以前做过的 PHP 相关内容都没有记录下来,从现在开始记录。
参考网址:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
/**
 * Demo: score the similarity of two Chinese sentences with cosine similarity
 * and print the result via dump().
 *
 * Steps: segment both sentences with SCWS, index each result as
 * word => occurrence count, then compute
 * cos(theta) = (A . B) / (|A| * |B|) over the union of all words.
 * A value closer to 1 means a smaller angle between the two term-frequency
 * vectors, i.e. more similar texts.
 *
 * @param mixed $value Unused; kept so the signature stays compatible.
 * @return void The score is dumped, nothing is returned.
 */
public function test($value='')
{
    $textB = "我喜欢看电视,不喜欢看电影。";
    $textA = "我不喜欢看电视,也不喜欢看电影。";

    // 1. Word segmentation.
    $scws = scws_new();
    $scws->set_charset("utf8");
    $scws->send_text($textA);
    $wordAList = $scws->get_words('~un');
    $scws->send_text($textB);
    $wordBList = $scws->get_words('~un');
    // Release the segmenter handle once both texts are processed.
    $scws->close();

    /**
     * Shape of one segmentation entry:
     * [0] => array(4) {
     *   ["word"]   => string(9) "我喜欢"
     *   ["times"]  => int(1)
     *   ["weight"] => float(4.8200001716614)
     *   ["attr"]   => string(1) "n"
     * }
     */

    // 2. Index every result as word => times. Anything that is not an array
    //    (get_words() signals failure with false) counts as "no words".
    //    The first entry wins if a word were ever listed twice.
    $countByWord = static function ($words) {
        $counts = [];
        if (!is_array($words)) {
            return $counts;
        }
        foreach ($words as $item) {
            if (!isset($counts[$item['word']])) {
                $counts[$item['word']] = (int) $item['times'];
            }
        }
        return $counts;
    };
    $timesA = $countByWord($wordAList);
    $timesB = $countByWord($wordBList);

    // The vocabulary is the union of the words seen in either text.
    $allWord = array_keys($timesA + $timesB);

    // 3. Accumulate the dot product and both squared norms in one pass.
    //    A word absent from one text contributes a frequency of 0 there.
    $topNumber = 0;
    $bottomNumberA = 0;
    $bottomNumberB = 0;
    foreach ($allWord as $word) {
        $countA = isset($timesA[$word]) ? $timesA[$word] : 0;
        $countB = isset($timesB[$word]) ? $timesB[$word] : 0;
        $topNumber += $countA * $countB;
        $bottomNumberA += $countA * $countA;
        $bottomNumberB += $countB * $countB;
    }

    // 4. Cosine of the angle between the two vectors. If either vector is
    //    empty the angle is undefined; report 0.0 instead of dividing by zero.
    $denominator = sqrt($bottomNumberA) * sqrt($bottomNumberB);
    $res = $denominator > 0 ? $topNumber / $denominator : 0.0;

    // Example output for the two sentences above: float(0.79259392390122)
    dump($res);
}