利用余弦值计算字串的相似性

之前用字符串匹配的方式做过一次相似度计算。这次发现竟然可以用余弦值来做,感觉很厉害,不过准确度如何还没有验证。以前做过的 PHP 东西都没有记录下来,从现在开始记录。
参考网址:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html

    /**
     * Demonstrates cosine similarity between two hard-coded Chinese sentences.
     *
     * Steps: segment both texts with SCWS, build a term-frequency vector for
     * each text over their combined vocabulary, then dump the cosine of the
     * angle between the two vectors (the closer to 1, the more similar).
     *
     * @param mixed $value Unused; kept so existing callers keep working.
     * @return void The similarity is printed via dump(), not returned.
     */
    public function test($value='')
    {
        $textB = "我喜欢看电视,不喜欢看电影。";
        $textA = "我不喜欢看电视,也不喜欢看电影。";

        // 1. Word segmentation.
        $scws = scws_new();
        $scws->set_charset("utf8");
        $scws->send_text($textA);
        $wordAList = $scws->get_words('~un');
        $scws->send_text($textB);
        $wordBList = $scws->get_words('~un');
        // Release the segmenter handle once both texts have been processed.
        $scws->close();
        /**
         *   Shape of each entry after segmentation:
         *   [0] => array(4) {
         *       ["word"] => string(9) "我喜欢"
         *       ["times"] => int(1)
         *       ["weight"] => float(4.8200001716614)
         *       ["attr"] => string(1) "n"
         *   }
         */

        // 2. Index each word list as word => times.
        // Array keys give exact-match lookups (the original used a loose ==
        // comparison, which treats numeric-looking words such as "1" and "01"
        // as equal). The first entry for a word wins, matching the original
        // lookup. A non-array result from get_words() yields an empty map.
        $countByWord = static function ($words) {
            $counts = [];
            if (!is_array($words)) {
                return $counts;
            }
            foreach ($words as $entry) {
                if (!isset($counts[$entry['word']])) {
                    $counts[$entry['word']] = (int) $entry['times'];
                }
            }
            return $counts;
        };
        $timesA = $countByWord($wordAList);
        $timesB = $countByWord($wordBList);

        // Combined vocabulary: words of text A first, then words only in B.
        $allWord = array_keys($timesA + $timesB);

        // 3. Accumulate the dot product and both squared norms in one pass.
        // A word missing from one text contributes a frequency of 0.
        $topNumber = 0;
        $bottomNumberA = 0;
        $bottomNumberB = 0;
        foreach ($allWord as $word) {
            $countA = isset($timesA[$word]) ? $timesA[$word] : 0;
            $countB = isset($timesB[$word]) ? $timesB[$word] : 0;
            $topNumber += $countA * $countB;
            $bottomNumberA += $countA * $countA;
            $bottomNumberB += $countB * $countB;
        }

        // 4. Cosine of the angle between the two vectors.
        // If either vector is all zeros the cosine is undefined; report 0.0
        // instead of dividing by zero.
        $denominator = sqrt($bottomNumberA) * sqrt($bottomNumberB);
        $res = $denominator > 0 ? $topNumber / $denominator : 0.0;

        // The closer the cosine is to 1, the closer the angle is to 0 degrees,
        // i.e. the more similar the two vectors are ("cosine similarity").
        // Output recorded by the original author: float(0.79259392390122)
        dump($res);
    }

标签: php

添加新评论