kafka_consumer.py
import sys
import logging
from time import sleep

import tweepy as tw
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, get_json_object, col, udf
from pyspark.sql.types import StructField, StructType, StringType

import config_file as cf
from LSTM.lstm_test import predict_sentiment
from LDA.lda_test import get_topic
from twitter.nrc_lexicon import fetch_nrc, preprocess
from twitter.responses import fetch_response
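
# config_file is assumed to be a local module exposing the four Twitter
# credentials referenced below, e.g. a plain Python file along the lines of:
#
#   consumer_key = "..."
#   consumer_secret = "..."
#   access_token = "..."
#   access_token_secret = "..."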


def get_api(row):
    # Called once per output row by foreach(); builds a reply based on the
    # tweet's Sentiment, Topic and NRC_Sentiment
    text = fetch_response(row.text, row.Sentiment, row.Topic, row.NRC_Sentiment)

    # Twitter authentication (tweepy v3 API)
    auth = tw.OAuthHandler(cf.consumer_key, cf.consumer_secret)
    auth.set_access_token(cf.access_token, cf.access_token_secret)
    api = tw.API(auth)
    try:
        if row.id is not None:
            logging.error("PREPARING TO REPLY............")
            # Tweet the response back to the user as a reply
            api.update_status(status="@" + row.user + " " + text,
                              in_reply_to_status_id=row.id,
                              auto_populate_reply_metadata=True)
        else:
            logging.error("ELSE CONDITION............")
    except tw.TweepError as e:
        logging.error(f"EXCEPTION {e.reason}")
        sleep(2)


def main(topic):
    logging.error("START OF MAIN!!!!!!!!!!!!!!")

    # Initialize the Kafka readStream
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic).load()
    values = messages.selectExpr("CAST(value AS STRING)")

    # Schema of the raw tweet JSON
    schema = StructType(
        [
            StructField("id", StringType()),
            StructField("text", StringType()),
            StructField("user", StringType())
        ]
    )
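
    # A message value matching this schema might look like (hypothetical
    # example):
    #   {"id": "1234567890", "text": "some tweet text",
    #    "user": "{\"screen_name\": \"someuser\"}"}
    # i.e. `user` is itself a JSON string, hence get_json_object below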

    # Parse the JSON payload and keep only the fields we need
    raw_df = values.withColumn("value", from_json("value", schema)) \
        .select(col('value.*'))
    new_df = raw_df.select(raw_df.id, raw_df.text,
                           get_json_object(raw_df.user, '$.screen_name').alias("user"))

    # UDFs for predicting sentiment, fetching the topic and the NRC emotion
    predict_sentiment_udf = udf(predict_sentiment, StringType())
    get_topic_udf = udf(get_topic, StringType())
    preprocess_udf = udf(preprocess, StringType())
    nrc_sentiment_udf = udf(fetch_nrc, StringType())

    new_df = new_df.withColumn('Sentiment', predict_sentiment_udf(raw_df.text))
    new_df = new_df.withColumn('Topic', get_topic_udf(raw_df.text))
    tokenized_text = preprocess_udf(raw_df.text)
    new_df = new_df.withColumn('NRC_Sentiment', nrc_sentiment_udf(tokenized_text))
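
    # At this point each streaming row carries six columns:
    # id, text, user (screen name), Sentiment, Topic, NRC_Sentiment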

    # Send every enriched row to get_api; foreach() is the sink here, so
    # the console format set previously was redundant and has been dropped
    stream = new_df.writeStream.foreach(get_api).start()
    logging.error("After starting stream")

    # Let the stream run (roughly 2.8 hours), then shut it down
    sleep(10000)
    stream.stop()


if __name__ == '__main__':
    topic = sys.argv[1]
    spark = SparkSession.builder.appName('dataminers').getOrCreate()
    # Make sure we have Spark 3.0+
    assert spark.version >= '3.0'
    spark.sparkContext.setLogLevel('WARN')
    main(topic)
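
# Example invocation (hypothetical; the Kafka connector package must match
# your Spark/Scala build, shown here for Spark 3.0.x with Scala 2.12):
#
#   spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 \
#       kafka_consumer.py <kafka_topic>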