# nlp_hw1 (1).py

# -*- coding: utf-8 -*-
"""NLP_HW1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Haz5duS_tl0DsdNcNiElTjqOGHRaDXWn
"""

from types import resolve_bases
import sys
from collections import defaultdict
import math
import random
import os
import os.path
"""
COMS W4705 - Natural Language Processing - Fall 2022 
Programming Homework 1 - Trigram Language Models
Daniel Bauer
Alice Diakova
"""
def corpus_reader(corpusfile, lexicon=None):
    """Lazily yield each non-blank line of a corpus file as a list of tokens.

    Every line is lowercased, stripped, and split on whitespace. Lines that
    contain only whitespace are skipped.

    Args:
        corpusfile: Path of the text file to read, one sentence per line.
        lexicon: Optional container of known words. When it is truthy, any
            token that is not a member is replaced by the string "UNK".

    Yields:
        A list of string tokens for each non-blank line, in file order.
    """
    with open(corpusfile, 'r') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                # Blank or whitespace-only line: nothing to yield.
                continue
            tokens = raw_line.lower().strip().split()
            if lexicon:
                # Map out-of-lexicon words onto the shared unknown-word token.
                tokens = [tok if tok in lexicon else "UNK" for tok in tokens]
            yield tokens

def get_lexicon(corpus):
    """Return the set of words that occur more than once in the corpus.

    Args:
        corpus: Iterable of sentences, each an iterable of hashable words.

    Returns:
        A set containing every word whose total count over the whole corpus
        is greater than one.
    """
    seen_once = set()
    seen_again = set()
    for tokens in corpus:
        for token in tokens:
            if token in seen_once:
                # Second (or later) sighting: the word qualifies.
                seen_again.add(token)
            else:
                seen_once.add(token)
    return seen_again
  
def get_ngrams_helper(sequence, n, res):
    """Append the padded n-grams of ``sequence`` to ``res`` and return ``res``.

    The sequence is padded on the left with ``max(1, n - 1)`` 'START' tokens
    and on the right with a single 'STOP' token; every window of ``n``
    consecutive tokens of the padded sequence becomes one tuple.

    The previous implementation emitted only one START-padded n-gram, so for
    n >= 3 the leading n-grams such as ('START', 'START', w1) were missing.

    Args:
        sequence: Sequence of tokens (not modified).
        n: Size of the n-grams; values below 1 add nothing.
        res: List that receives the n-gram tuples (mutated in place).

    Returns:
        The same ``res`` list that was passed in.
    """
    if n < 1:
        return res

    # Unigrams still get a single START so that sentence starts are counted.
    padded = ['START'] * max(1, n - 1) + list(sequence) + ['STOP']
    res.extend(tuple(padded[i:i + n]) for i in range(len(padded) - n + 1))
    return res

def get_ngrams(sequence, n):
    """Return the list of padded n-grams of ``sequence`` as tuples.

    The sequence is padded on the left with ``max(1, n - 1)`` 'START' tokens
    and on the right with one 'STOP' token, and every window of ``n``
    consecutive padded tokens is returned in order. For example, for
    ``["natural", "language", "processing"]`` and ``n = 3`` the result is::

        [('START', 'START', 'natural'), ('START', 'natural', 'language'),
         ('natural', 'language', 'processing'),
         ('language', 'processing', 'STOP')]

    This works for any ``n >= 1`` regardless of the sequence length. The
    previous implementation dropped the leading START n-grams whenever
    ``n < len(sequence)`` and collapsed everything into one tuple when
    ``n >= len(sequence) + 2``.

    Args:
        sequence: Sequence of tokens (must support ``len``).
        n: Size of the n-grams.

    Returns:
        A list of ``n``-tuples; the empty list when ``sequence`` is empty or
        ``n`` is smaller than 1.
    """
    if n < 1 or len(sequence) == 0:
        return []

    # Unigrams still get a single START so that sentence starts are counted.
    padded = ['START'] * max(1, n - 1) + list(sequence) + ['STOP']
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]

class TrigramModel(object):
    """Trigram language model with linear-interpolation smoothing.

    Attributes (populated by ``__init__`` / ``count_ngrams``):
        lexicon: Set of known words plus "UNK", "START" and "STOP".
        unigramcounts, bigramcounts, trigramcounts: Dicts mapping n-gram
            tuples to their number of occurrences in the training corpus.
        word_count: Number of unigram tokens counted, excluding 'START'
            (so 'STOP' tokens are included).
    """

    def __init__(self, corpusfile):
        """Build the lexicon and the n-gram counts from ``corpusfile``.

        The file is read twice: once to collect the lexicon and once, with
        out-of-lexicon words mapped to "UNK", to count n-grams.
        """
        # Iterate through the corpus once to build a lexicon
        generator = corpus_reader(corpusfile)
        self.lexicon = get_lexicon(generator)
        self.lexicon.add("UNK")
        self.lexicon.add("START")
        self.lexicon.add("STOP")

        # Now iterate through the corpus again and count ngrams
        generator = corpus_reader(corpusfile, self.lexicon)
        self.word_count = 0
        self.count_ngrams(generator)

    def count_ngrams(self, corpus):
        """Populate the unigram, bigram and trigram count dictionaries.

        All counters, including ``word_count``, are reset first so that the
        method can safely be called more than once on the same instance.

        Args:
            corpus: Iterable of sentences (lists of tokens).
        """
        self.unigramcounts = {}
        self.bigramcounts = {}
        self.trigramcounts = {}
        self.word_count = 0

        for sentence in corpus:
            for unigram in get_ngrams(sentence, 1):
                self.unigramcounts[unigram] = self.unigramcounts.get(unigram, 0) + 1
                # The word count includes STOP tokens but excludes START
                # tokens.
                if unigram[0] != 'START':
                    self.word_count += 1

            for bigram in get_ngrams(sentence, 2):
                self.bigramcounts[bigram] = self.bigramcounts.get(bigram, 0) + 1

            for trigram in get_ngrams(sentence, 3):
                self.trigramcounts[trigram] = self.trigramcounts.get(trigram, 0) + 1

    def raw_trigram_probability(self, trigram):
        """Return the raw (unsmoothed) probability P(w3 | w1, w2).

        Returns 0.0 when the trigram or its two-word context was never
        counted.
        """
        count = self.trigramcounts.get(trigram, 0)
        if count == 0:
            return 0.0

        context = (trigram[0], trigram[1])
        if context == ('START', 'START'):
            # The bigram counts never contain ('START', 'START') because
            # bigrams are padded with a single START; the number of times
            # this context occurs equals the number of counted sentences,
            # i.e. the count of the ('START',) unigram.
            context_count = self.unigramcounts.get(('START',), 0)
        else:
            context_count = self.bigramcounts.get(context, 0)

        if context_count == 0:
            return 0.0
        return count / float(context_count)

    def raw_bigram_probability(self, bigram):
        """Return the raw (unsmoothed) probability P(w2 | w1).

        Returns 0.0 when the bigram or its one-word context was never
        counted.
        """
        count = self.bigramcounts.get(bigram, 0)
        if count == 0:
            return 0.0

        context_count = self.unigramcounts.get((bigram[0],), 0)
        if context_count == 0:
            return 0.0
        return count / float(context_count)

    def raw_unigram_probability(self, unigram):
        """Return the raw (unsmoothed) unigram probability.

        The denominator is the ``word_count`` stored on the instance, so it
        is not recomputed on every call. Returns 0.0 for an unseen unigram
        or when no words have been counted.
        """
        count = self.unigramcounts.get(unigram, 0)
        if count == 0 or self.word_count == 0:
            return 0.0
        return count / float(self.word_count)

    def generate_sentence(self, t=20):
        """Generate a random sentence from the raw trigram counts.

        Starting from the context ('START', 'START'), the next word is
        drawn with probability proportional to the count of the trigram
        (context + word). Generation stops after ``t`` words, after 'STOP'
        has been produced (it is included in the result), or when the
        current context has no counted continuation.

        Args:
            t: Maximum number of tokens to generate.

        Returns:
            A list of generated tokens (possibly empty).
        """
        result = []
        context = ('START', 'START')
        for _ in range(t):
            # Linear scan over all trigrams; simple, and good enough for
            # occasional sampling.
            words = []
            weights = []
            for tri, count in self.trigramcounts.items():
                if (tri[0], tri[1]) == context:
                    words.append(tri[2])
                    weights.append(count)
            if not words:
                break

            word = random.choices(words, weights=weights)[0]
            result.append(word)
            if word == 'STOP':
                break
            context = (context[1], word)
        return result

    def smoothed_trigram_probability(self, trigram):
        """Return P(w3 | w1, w2) smoothed by linear interpolation.

        The estimate is
        ``l1 * P(w3 | w1, w2) + l2 * P(w3 | w2) + l3 * P(w3)`` with all three
        weights equal to 1/3. The lower-order terms are conditioned on the
        *last* words of the trigram (the predicted word w3 and its
        immediate predecessor w2), not on the first ones.
        """
        lambda1 = 1 / 3.0
        lambda2 = 1 / 3.0
        lambda3 = 1 / 3.0

        bigram = (trigram[1], trigram[2])
        unigram = (trigram[2],)

        return (lambda1 * self.raw_trigram_probability(trigram)
                + lambda2 * self.raw_bigram_probability(bigram)
                + lambda3 * self.raw_unigram_probability(unigram))

    def sentence_logprob(self, sentence):
        """Return the base-2 log probability of an entire sentence.

        This is the sum of ``log2`` of the smoothed probability of every
        trigram of the sentence. If any trigram has probability zero the
        result is ``-inf`` instead of raising a math domain error.
        """
        logprob = 0.0
        for trigram in get_ngrams(sentence, 3):
            prob = self.smoothed_trigram_probability(trigram)
            if prob <= 0:
                return float('-inf')
            logprob += math.log2(prob)
        return logprob

    def perplexity(self, corpus):
        """Return the perplexity of the model on ``corpus``.

        Perplexity is ``2 ** (-l)`` where ``l`` is the summed sentence log
        probability divided by the number of tokens in the evaluated
        corpus, counting one 'STOP' per sentence and no 'START' tokens.
        The corpus is consumed in a single pass, so a generator can be
        passed.

        Returns ``float('inf')`` when the corpus contains no tokens.
        """
        total_logprob = 0.0
        total_tokens = 0
        for sentence in corpus:
            if len(sentence) == 0:
                continue
            total_logprob += self.sentence_logprob(sentence)
            total_tokens += len(sentence) + 1  # +1 for the STOP token

        if total_tokens == 0:
            return float('inf')

        return 2 ** (-(total_logprob / float(total_tokens)))

def essay_scoring_experiment(training_file1, training_file2, testdir1, testdir2):
    """Classify test essays with two trigram models and return the accuracy.

    One model is trained on each training file. Every file in ``testdir1``
    is expected to be closer to ``training_file1`` and every file in
    ``testdir2`` to ``training_file2``. A test file counts as correctly
    classified when the model of its own class assigns it a strictly lower
    perplexity than the other model. Each model reads the file with its own
    lexicon, so unknown words are mapped to "UNK" per model.

    The previous version compared a single perplexity against a fixed
    threshold, which does not discriminate between the two classes.

    Args:
        training_file1: Training corpus for the first class.
        training_file2: Training corpus for the second class.
        testdir1: Directory of test files belonging to the first class.
        testdir2: Directory of test files belonging to the second class.

    Returns:
        The fraction of correctly classified test files as a float, or 0.0
        when both directories are empty.
    """
    model1 = TrigramModel(training_file1)
    model2 = TrigramModel(training_file2)

    total = 0
    correct = 0

    # (model of the expected class, competing model, directory of test files)
    experiments = (
        (model1, model2, testdir1),
        (model2, model1, testdir2),
    )
    for own_model, other_model, testdir in experiments:
        for f in os.listdir(testdir):
            path = os.path.join(testdir, f)
            # corpus_reader returns a one-shot generator, so a fresh reader
            # is needed for each model.
            pp_own = own_model.perplexity(corpus_reader(path, own_model.lexicon))
            pp_other = other_model.perplexity(corpus_reader(path, other_model.lexicon))
            if pp_own < pp_other:
                correct += 1
            total += 1

    if total == 0:
        return 0.0
    return correct / float(total)
if __name__ == "__main__":
    # Interactive use: run the script with
    #     $ python -i trigram_model.py [corpus_file]
    # and call methods on a model instance from the Python prompt, e.g.
    #     model = TrigramModel('hw1_data/brown_train.txt')
    #     dev_corpus = corpus_reader('hw1_data/brown_train.txt', model.lexicon)
    #     print(model.perplexity(dev_corpus))

    # Essay scoring experiment: train one model per training file and report
    # the classification accuracy over both test directories.
    train_high = 'hw1_data/ets_toefl_data/train_high.txt'
    train_low = 'hw1_data/ets_toefl_data/train_low.txt'
    test_high_dir = "hw1_data/ets_toefl_data/test_high"
    test_low_dir = "hw1_data/ets_toefl_data/test_low"

    acc = essay_scoring_experiment(train_high, train_low,
                                   test_high_dir, test_low_dir)
    print(acc)