.NET下文本相似度算法余弦定理和SimHash浅析及应用实例分析

2019-05-23 06:12:16刘景俊

        }
       
        private IDictionary GetWordFrequency(string input)
        {
            string convertedInput=input.ToLower() ;
            Tokeniser tokenizer=new Tokeniser() ;
            String[] words=tokenizer.Partition(convertedInput);
            Array.Sort(words);
           
            String[] distinctWords=GetDistinctWords(words);
                       
            IDictionary result=new Hashtable();
            for (int i=0; i < distinctWords.Length; i++)
            {
                object tmp;
                tmp=CountWords(distinctWords[i], words);
                result[distinctWords[i]]=tmp;
            }
            return result;
        }               
               
        private string[] GetDistinctWords(String[] input)
        {               
            if (input == null)           
                return new string[0];           
            else
            {
                ArrayList list=new ArrayList() ;