From d97efad12a74435da0389a735117883ba14e5fb2 Mon Sep 17 00:00:00 2001
From: jyotsnaparyani
Date: Mon, 8 May 2017 14:42:52 -0500
Subject: [PATCH] Add files via upload

---
Review notes (this commentary area is ignored when the patch is applied):
The line structure of the patch has been restored and the hunk sizes match
the file contents below. The per-file blob "index" lines are omitted because
the file contents were revised during review, so the recorded blob ids no
longer describe them. All scripts target Python 2 (print statements).

 CalculateDiffusionIndexMapper.py   | 55 ++++++++++++++++++++++++++++
 CalculateDiffusionIndexReducer.py  | 42 +++++++++++++++++++++
 CalculateGeographicIndexMapper.py  | 59 ++++++++++++++++++++++++++++++
 CalculateGeographicIndexReducer.py | 47 ++++++++++++++++++++++++
 CalculateSpamIndexReducer.py       | 43 ++++++++++++++++++++++
 CalculateTweetCountMapper.py       | 10 +++++
 CalculateTweetCountReducer.py      | 17 +++++++++
 7 files changed, 273 insertions(+)
 create mode 100644 CalculateDiffusionIndexMapper.py
 create mode 100644 CalculateDiffusionIndexReducer.py
 create mode 100644 CalculateGeographicIndexMapper.py
 create mode 100644 CalculateGeographicIndexReducer.py
 create mode 100644 CalculateSpamIndexReducer.py
 create mode 100644 CalculateTweetCountMapper.py
 create mode 100644 CalculateTweetCountReducer.py

diff --git a/CalculateDiffusionIndexMapper.py b/CalculateDiffusionIndexMapper.py
new file mode 100644
--- /dev/null
+++ b/CalculateDiffusionIndexMapper.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+"""Hadoop-streaming mapper for the diffusion index.
+
+Reads one JSON tweet per line on stdin and prints "<topicId><TAB><user>"
+once for every topic keyword that occurs in the tweet text.
+"""
+
+import json
+import sys
+
+TOPIC_FILES = ["foodtopic1", "foodtopic2", "foodtopic3"]
+
+
+def readFileandReturnAnArray(fileName, readMode, isLower):
+    """Return the stripped lines of fileName, lower-cased when isLower."""
+    myArray = []
+    with open(fileName, readMode) as readHandle:
+        for line in readHandle:
+            if isLower:
+                line = line.lower()
+            myArray.append(line.strip())
+    return myArray
+
+
+def loadTopics(topicFiles):
+    """Return [(topicId, keywords)]; line 1 of each file is the topic id."""
+    loaded = []
+    for fileName in topicFiles:
+        lines = readFileandReturnAnArray(fileName, "r", True)
+        if lines:
+            # An empty keyword is a substring of every tweet, so drop blanks.
+            loaded.append((lines[0], [word for word in lines[1:] if word]))
+    return loaded
+
+
+# Topic files do not change during a run: read them once, not per tweet.
+topics = loadTopics(TOPIC_FILES)
+
+for line in sys.stdin:
+    try:
+        parsed_json_tweet = json.loads(line)
+        tweets_text = parsed_json_tweet['text'].strip()
+        user_handle = parsed_json_tweet['user']['screen_name']
+    except (ValueError, KeyError, TypeError, AttributeError):
+        # Blank lines and non-tweet records carry no usable text or user.
+        continue
+    if user_handle is None:
+        continue
+    username = user_handle.encode('ascii', 'ignore').strip()
+    # NOTE(review): keywords are lower-cased but the tweet text is not, so
+    # upper-case text never matches -- confirm whether that is intended.
+    for topicId, keywords in topics:
+        for keyword in keywords:
+            if keyword in tweets_text:
+                print '%s\t%s' % (topicId, username)
diff --git a/CalculateDiffusionIndexReducer.py b/CalculateDiffusionIndexReducer.py
new file mode 100644
--- /dev/null
+++ b/CalculateDiffusionIndexReducer.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+"""Hadoop-streaming reducer for the diffusion index.
+
+Reads "<topicId><TAB><user>" lines on stdin and prints, per topic, the
+number of distinct users divided by the total in the "tweetCount" file.
+"""
+
+from operator import itemgetter
+import sys
+
+
+def readFileandReturnAnArray(fileName, readMode, isLower):
+    """Return the last line of fileName, stripped (lower-cased if isLower).
+
+    Despite the name the result is one string, not a list.
+    Raises ValueError when the file has no lines.
+    """
+    count = None
+    with open(fileName, readMode) as readHandle:
+        for line in readHandle:
+            if isLower:
+                line = line.lower()
+            count = line.strip()
+    if count is None:
+        raise ValueError('%s contains no lines' % fileName)
+    return count
+
+
+totalTweetCount = readFileandReturnAnArray("tweetCount", "r", True)
+totalTweetCount = float(totalTweetCount)
+
+# topicId -> set of distinct users; a set keeps the duplicate check O(1).
+geo_score = {}
+
+for line in sys.stdin:
+    topicId, _, username = line.strip().partition('\t')
+    # Blank lines and records without a user are skipped, not fatal.
+    if username:
+        geo_score.setdefault(topicId, set()).add(username)
+
+for topic, users in geo_score.items():
+    print '%s\t%s' % (topic, len(users) / totalTweetCount)
diff --git a/CalculateGeographicIndexMapper.py b/CalculateGeographicIndexMapper.py
new file mode 100644
--- /dev/null
+++ b/CalculateGeographicIndexMapper.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""Hadoop-streaming mapper for the geographic index.
+
+Reads one JSON tweet per line on stdin and prints
+"<topicId><TAB><location>" once for every topic keyword that occurs in
+the tweet text.
+"""
+
+import json
+import sys
+
+TOPIC_FILES = ["foodtopic1", "foodtopic2", "foodtopic3"]
+
+
+def readFileandReturnAnArray(fileName, readMode, isLower):
+    """Return the stripped lines of fileName, lower-cased when isLower."""
+    myArray = []
+    with open(fileName, readMode) as readHandle:
+        for line in readHandle:
+            if isLower:
+                line = line.lower()
+            myArray.append(line.strip())
+    return myArray
+
+
+def loadTopics(topicFiles):
+    """Return [(topicId, keywords)]; line 1 of each file is the topic id."""
+    loaded = []
+    for fileName in topicFiles:
+        lines = readFileandReturnAnArray(fileName, "r", True)
+        if lines:
+            # An empty keyword is a substring of every tweet, so drop blanks.
+            loaded.append((lines[0], [word for word in lines[1:] if word]))
+    return loaded
+
+
+# Topic files do not change during a run: read them once, not per tweet.
+topics = loadTopics(TOPIC_FILES)
+
+for line in sys.stdin:
+    try:
+        parsed_json_tweet = json.loads(line)
+        tweets_text = parsed_json_tweet['text'].strip()
+        user_location = parsed_json_tweet['user']['location']
+    except (ValueError, KeyError, TypeError, AttributeError):
+        # Blank lines and non-tweet records carry no usable text or user.
+        continue
+    # A tweet without a location must not inherit the previous tweet's.
+    if user_location is None:
+        continue
+    location = user_location.encode('ascii', 'ignore').strip()
+    if not location:
+        continue
+    # NOTE(review): keywords are lower-cased but the tweet text is not, so
+    # upper-case text never matches -- confirm whether that is intended.
+    for topicId, keywords in topics:
+        for keyword in keywords:
+            if keyword in tweets_text:
+                print '%s\t%s' % (topicId, location)
diff --git a/CalculateGeographicIndexReducer.py b/CalculateGeographicIndexReducer.py
new file mode 100644
--- /dev/null
+++ b/CalculateGeographicIndexReducer.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+"""Hadoop-streaming reducer for the geographic index.
+
+Reads "<topicId><TAB><location>" lines on stdin and prints, per topic,
+the number of distinct locations divided by the total read from the
+"foodTweetCount1" file.
+"""
+
+import json
+import sys
+
+
+def readFileandReturnAnArray(fileName, readMode, isLower):
+    """Return the last line of fileName, stripped (lower-cased if isLower).
+
+    Despite the name the result is one string, not a list.
+    Raises ValueError when the file has no lines.
+    """
+    count = None
+    with open(fileName, readMode) as readHandle:
+        for line in readHandle:
+            if isLower:
+                line = line.lower()
+            count = line.strip()
+    if count is None:
+        raise ValueError('%s contains no lines' % fileName)
+    return count
+
+
+totalTweetCount = readFileandReturnAnArray("foodTweetCount1", "r", True)
+totalTweetCount = float(totalTweetCount)
+
+# topicId -> set of distinct locations; a set keeps the duplicate check O(1).
+geo_score = {}
+
+for line in sys.stdin:
+    line = line.strip()
+    if not line:
+        continue
+    # Split on the first tab: a topic id may be longer than one character.
+    topicId, _, location = line.partition('\t')
+    locations = geo_score.setdefault(topicId, set())
+    if location and location != "noLocation":
+        locations.add(location)
+
+for topic, locations in geo_score.items():
+    print '%s\t%s' % (topic, len(locations) / totalTweetCount)
diff --git a/CalculateSpamIndexReducer.py b/CalculateSpamIndexReducer.py
new file mode 100644
--- /dev/null
+++ b/CalculateSpamIndexReducer.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+"""Hadoop-streaming reducer for the spam index.
+
+Reads "<topicId><TAB><user>" lines on stdin and prints, per topic,
+(1 / number of distinct users) divided by the total in "tweetCount".
+"""
+
+import sys
+
+
+def readFileandReturnAnArray(fileName, readMode, isLower):
+    """Return the last line of fileName, stripped (lower-cased if isLower).
+
+    Despite the name the result is one string, not a list.
+    Raises ValueError when the file has no lines.
+    """
+    count = None
+    with open(fileName, readMode) as readHandle:
+        for line in readHandle:
+            if isLower:
+                line = line.lower()
+            count = line.strip()
+    if count is None:
+        raise ValueError('%s contains no lines' % fileName)
+    return count
+
+
+totalTweetCount = readFileandReturnAnArray("tweetCount", "r", True)
+totalTweetCount = float(totalTweetCount)
+
+# topicId -> set of distinct users; a set keeps the duplicate check O(1).
+geo_score = {}
+
+for line in sys.stdin:
+    topicId, _, username = line.strip().partition('\t')
+    # A topic is recorded only together with a user, so every set is
+    # non-empty and 1.00 / len(users) below cannot divide by zero.
+    if username:
+        geo_score.setdefault(topicId, set()).add(username)
+
+for topic, users in geo_score.items():
+    spamIndex = 1.00 / len(users)
+    print '%s\t%s' % (topic, spamIndex / totalTweetCount)
diff --git a/CalculateTweetCountMapper.py b/CalculateTweetCountMapper.py
new file mode 100644
--- /dev/null
+++ b/CalculateTweetCountMapper.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+"""Hadoop-streaming mapper: print "Tweet Count:<TAB>1" per input record."""
+
+import json
+import sys
+
+for line in sys.stdin:
+    # Lines whose stripped length is 0 or 1 are not counted as records.
+    if len(line.strip()) > 1:
+        print "%s\t%s" % ("Tweet Count:", 1)
diff --git a/CalculateTweetCountReducer.py b/CalculateTweetCountReducer.py
new file mode 100644
--- /dev/null
+++ b/CalculateTweetCountReducer.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+"""Hadoop-streaming reducer: sum the counts of "<key><TAB><count>" lines."""
+
+from operator import itemgetter
+import sys
+
+current_count = 0
+
+for line in sys.stdin:
+    # partition() never raises, so a line without a tab is skipped below
+    # (int('') is a ValueError) instead of aborting the whole reduce.
+    _, _, count = line.strip().partition('\t')
+    try:
+        current_count += int(count)
+    except ValueError:
+        continue
+print '%s' % (current_count)