.NET下文本相似度算法余弦定理和SimHash浅析及应用实例分析

2019-05-23 06:12:16 刘景俊

                       
                return (float)Math.Sqrt(sum);
            }
        }
 
        // Dictionary field, created as an empty Hashtable when the instance is constructed.
        // NOTE(review): presumably maps each term to its position in the term list —
        // confirm against the code that populates it (not visible in this section).
        private IDictionary _wordsIndex=new Hashtable() ;
 
        public TFIDFMeasure(string[] documents)
        {
            _docs=documents;
            _numDocs=documents.Length ;
            MyInit();
        }
 
        /// <summary>
        /// Placeholder with an empty body: calling it currently does nothing.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the name suggests n-gram text generation was intended here —
        /// confirm whether it should be implemented or removed.
        /// </remarks>
        private void GeneratNgramText()
        {
           
        }
 
        private ArrayList GenerateTerms(string[] docs)
        {
            ArrayList uniques=new ArrayList() ;
            _ngramDoc=new string[_numDocs][] ;
            for (int i=0; i < docs.Length ; i++)
            {
                Tokeniser tokenizer=new Tokeniser() ;
                string[] words=tokenizer.Partition(docs[i]);           
 
                for (int j=0; j < words.Length ; j++)
                    if (!uniques.Contains(words[j]) )               
                        uniques.Add(words[j]) ;
            }
            return uniques;
        }

        private static object AddElement(IDictionary collection, object key, object newValue)