Measuring cosine similarity for multiple documents and queries

Jun 23 2015 12:49 AM
// Entry point: builds the weight matrix via getContent(), extracts its first
// two columns as vectors, and prints their cosine similarity to the console.
static void Main(string[] args) {
  var weights = getContent();
  var rowCount = weights.GetLength(0);
  var firstColumn = new double[rowCount];
  var secondColumn = new double[rowCount];

  // Copy column 0 and column 1 of the matrix in a single pass.
  for (int row = 0; row < rowCount; row++) {
    firstColumn[row] = weights[row, 0];
    secondColumn[row] = weights[row, 1];
  }

  // NOTE(review): getContent() adds the query first and document 1 second, so
  // column 1 looks like the first document rather than "Doc2" — confirm the label.
  var similarity = CalculateCosineSimilarity(firstColumn, secondColumn);
  Console.WriteLine("Similarity between Query and Doc2: ");
  Console.WriteLine(similarity);

  // Keep the console window open until a key is pressed.
  Console.ReadKey();
}
private static double
/// <summary>
/// Builds a TF-IDF weight matrix with one row per word and one column per document.
/// </summary>
/// <param name="splitedDocuments">The documents, each already split into its words.</param>
/// <param name="unicalWords">The vocabulary; row j of the result belongs to unicalWords[j].</param>
/// <returns>
/// A matrix where element [j, i] is tf * idf, with
/// tf = occurrences of word j in document i / number of words in document i and
/// idf = 1 + ln(number of documents / number of documents containing word j).
/// The weight is 0 when the word does not occur in the document, which includes
/// words found in no document at all and empty documents.
/// </returns>
public static double[, ] GetWeights(List < List < string >> splitedDocuments, string[] unicalWords) {
  int documentCount = splitedDocuments.Count;
  int wordCount = unicalWords.Length;
  double[, ] matrix = new double[wordCount, documentCount];

  // Count the words of every document once, instead of re-scanning per vocabulary word.
  var termCounts = splitedDocuments
    .Select(words => words.GroupBy(w => w).ToDictionary(g => g.Key, g => g.Count()))
    .ToList();

  for (int j = 0; j < wordCount; j++) {
    var key = unicalWords[j];
    if (key == null) {
      // A null entry cannot match any word; its row keeps the default weight of 0.
      continue;
    }

    // The document frequency depends only on the word, so compute it once per row.
    int documentFrequency = termCounts.Count(counts => counts.ContainsKey(key));
    if (documentFrequency == 0) {
      // Dividing by a zero document frequency would yield Infinity and then NaN weights.
      continue;
    }

    double idf = 1 + Math.Log((double) documentCount / documentFrequency);

    for (int i = 0; i < documentCount; i++) {
      int occurrences;
      if (!termCounts[i].TryGetValue(key, out occurrences)) {
        // Word absent from this document (always the case for an empty document,
        // which also avoids the 0 / 0 term frequency).
        continue;
      }

      double tf = (double) occurrences / splitedDocuments[i].Count;
      matrix[j, i] = tf * idf;
    }
  }

  return matrix;
}
/// <summary>
/// Computes the cosine similarity of two vectors: their dot product divided by
/// the product of their magnitudes.
/// </summary>
/// <param name="vecA">The first vector.</param>
/// <param name="vecB">The second vector.</param>
/// <returns>
/// The cosine of the angle between the vectors, or 0 when the product of the
/// magnitudes is zero (for example when either vector is all zeros).
/// </returns>
private static double CalculateCosineSimilarity(double[] vecA, double[] vecB) {
  var dotProduct = DotProduct(vecA, vecB);
  var magnitudeOfA = Magnitude(vecA);
  var magnitudeOfB = Magnitude(vecB);
  var denominator = magnitudeOfA * magnitudeOfB;

  // A zero denominator would produce NaN (0 / 0) or Infinity, which cannot be
  // ranked meaningfully; treat such vectors as having no similarity.
  if (denominator == 0) {
    return 0;
  }

  return dotProduct / denominator;
}
/// <summary>
/// Computes the dot product of two vectors of equal length.
/// </summary>
/// <param name="vecA">The first vector.</param>
/// <param name="vecB">The second vector.</param>
/// <returns>The sum of the element-wise products; 0 for empty vectors.</returns>
/// <exception cref="ArgumentNullException">Either vector is null.</exception>
/// <exception cref="ArgumentException">The vectors differ in length.</exception>
private static double DotProduct(double[] vecA, double[] vecB) {
  if (vecA == null) {
    throw new ArgumentNullException("vecA");
  }
  if (vecB == null) {
    throw new ArgumentNullException("vecB");
  }
  // Without this check a longer vecB would have its tail silently ignored and a
  // shorter one would fail with an IndexOutOfRangeException part-way through.
  if (vecA.Length != vecB.Length) {
    throw new ArgumentException("Vectors must have the same length.", "vecB");
  }

  double dotProduct = 0;
  for (var i = 0; i < vecA.Length; i++) {
    dotProduct += vecA[i] * vecB[i];
  }
  return dotProduct;
}
/// <summary>
/// Returns the magnitude of a vector: the square root of its dot product with itself.
/// </summary>
/// <param name="vector">The vector to measure.</param>
private static double Magnitude(double[] vector) {
  double sumOfSquares = DotProduct(vector, vector);
  return Math.Sqrt(sumOfSquares);
}
  /// <summary>
  /// Builds the sample corpus (one query and three documents), splits each text
  /// on spaces, and returns the weight matrix produced by GetWeights.
  /// </summary>
  /// <returns>
  /// The matrix returned by GetWeights for the sample texts. Documents are passed
  /// in the order query, document 1, document 2, document 3.
  /// </returns>
  private static double[, ] getContent() {
    string query = "life learning";
    string document1 = "The game of life is a game of everlasting learning";
    string document2 = "The unexamined life is not worth living";
    string document3 = "Never stop learning";

    var splitedQuery = query.Split(' ').ToList();
    var splitedDocument1 = document1.Split(' ').ToList();
    var splitedDocument2 = document2.Split(' ').ToList();
    var splitedDocument3 = document3.Split(' ').ToList();

    // Distinct words across all texts, in order of first appearance.
    var unicalWords = (query + " " + document1 + " " + document2 + " " + document3).Split(' ').GroupBy(g => g).Select(s => s.Key).ToArray();

    // The query goes first so that it occupies the first position in the list.
    List < List < string >> documents = new List < List < string >> ();
    documents.Add(splitedQuery);
    documents.Add(splitedDocument1);
    documents.Add(splitedDocument2);
    documents.Add(splitedDocument3);

    return GetWeights(documents, unicalWords);
  }

I want to check the similarity of one document against multiple queries, store the results in an array, and sort that array in descending order.


Answers (1)