diff --git a/Task#5 -- CorEx Model-All Positive Articles (Text with coref).ipynb b/Task#5 -- CorEx Model-All Positive Articles (Text with coref).ipynb new file mode 100644 index 0000000..52c3d47 --- /dev/null +++ b/Task#5 -- CorEx Model-All Positive Articles (Text with coref).ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "#https://github.com/gregversteeg/corex_topic" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from corextopic import corextopic as ct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CorEx on Positive Articles" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " class text \\\n", + "0 positive Andhra Pradesh Chief Minister N Chandrababu Na... \n", + "1 positive This story is from January 16, 2018\\r\\r\\n\\r\\r\\... \n", + "2 positive A crowd angered over what they believed was th... \n", + "3 positive After having spent over 12 hours dousing flame... \n", + "4 positive The impact of drought on women farmers remains... \n", + "\n", + " text_coref \\\n", + "0 Andhra Pradesh Chief Minister N Chandrababu Na... \n", + "1 This story is from January 16, 2018\\r\\nBHOPAL:... \n", + "2 A crowd angered over what A crowd believed was... \n", + "3 After having spent over 12 hours dousing flame... \n", + "4 The impact of drought on women farmers remains... \n", + "\n", + " title \n", + "0 Maoists using bauxite mining as pretext to kil... \n", + "1 In Madhya Pradesh, even the dead talk \n", + "2 Two killed in violence over cow slaughter in n... \n", + "3 Army to take up Bellandur lake fire issue with... \n", + "4 True victims of farm crisis \n", + "\n", + "RangeIndex: 507 entries, 0 to 506\n", + "Data columns (total 4 columns):\n", + "class 507 non-null object\n", + "text 507 non-null object\n", + "text_coref 507 non-null object\n", + "title 507 non-null object\n", + "dtypes: object(4)\n", + "memory usage: 15.9+ KB\n", + "None\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "wri = pd.read_csv(\"wri_coref_110319.csv\",index_col=0)\n", + "\n", + "wri = wri[wri['class']=='positive']\n", + "wri.reset_index(drop=True, inplace=True)\n", + "\n", + "print(wri.head())\n", + "print(wri.info())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "vectorizer = TfidfVectorizer(\n", + " max_df=.5,\n", + " min_df=1,\n", + " max_features=None,\n", + " ngram_range=(1, 2),\n", + " norm=None,\n", + " binary=True,\n", + " use_idf=False,\n", + " sublinear_tf=False,\n", + " strip_accents = 'unicode',\n", + " stop_words = 'english'\n", + ")\n", + "\n", + "vectorizer = vectorizer.fit(wri['text_coref'])\n", + "tfidf = vectorizer.transform(wri['text_coref'])\n", + "vocab = vectorizer.get_feature_names()\n", + "print(len(vocab))\n", + "print(vocab)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "from corextopic import corextopic as ct" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "TOPICS = 7\n", + "NBR_OF_WORDS = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "anchors = []\n", + "model = ct.Corex(n_hidden=TOPICS, seed=42)\n", + "model = model.fit(\n", + " tfidf,\n", + " words=vocab\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keywords for Topic Anchors\n", + "['land', 'acre','hectares', 'acquisition', 'land acquisition', 'agricultural', 'acres', 'degradation','landslides','property','resettlement'],\n", + "\n", + "['farmer', 'farming', 'agricultural', 'produce', 'crop', 'crops', 'agrarian', 'farms','farm','field','fields','soil','sugarcane','vegetables','farmers','agriculture','tractor','prices crops', 'debt','quota','food','fruits','livestock','cow','wheat','harvest','harvesting','horticulture','loan','loans','milk','paddy','rice','plant','plants','potatoes','potato'],\n", + "\n", + "['mining', 'coal', 'miner', 'miners','sand mining', 'sand','bauxite','iron ore','limestone','manganese ore','granite'],\n", + "\n", + "['forest','forests', 'forest department', 'reserve', 'forest officials','forestry'],\n", + "\n", + "['animal','leopard','leopards', 'animals', 'wildlife', 'tiger', 'attacked', 'slaughter', 'lion','lions', 'threat', 'tigress', 'bear','birds','cat','cattle','crocodile','elephant','elephants','pangolin','pangolins','species'],\n", + "\n", + "['drought', 'droughts','monsoon', 'rain','rains','rainfall','disaster'],\n", + "\n", + "['water', 'irrigation', 'monsoon', 'rain', 'flood', 'floods', 'flooded', 'climate change','climate','dam','dams','drinking']\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# Anchors designed to nudge the model towards measuring specific genres\n", + "\n", + "anchors = [\n", + " ['land','resettlement','degradation','plot'],\n", + " ['farm','Farmers','crop','agriculture','crops','agrarian','farmer','farmers''cows','tractor','acre','fields','livestock','harvest','harvesting','potato','sugarcane','paddy','rice','milk'],\n", + " ['mining', 'coal', 'miner', 'miners','sand mining', 'sand','bauxite','iron ore','limestone','manganese ore','granite'],\n", + " ['forest','deforestation','trees'],\n", + " ['animal','attacked','leopard','leopards','tiger','tigress','crocodile'],\n", + " ['drought','rain','climate change'],\n", + " ['water','dams','irrigation','flood','drinking'] \n", + " \n", + "]\n", + "anchors = [\n", + " [a for a in topic if a in vocab]\n", + " for topic in anchors\n", + "]\n", + "\n", + "model = ct.Corex(n_hidden=TOPICS, seed=42)\n", + "model = model.fit(\n", + " tfidf,\n", + " words=vocab,\n", + " anchors=anchors, # Pass the anchors in here\n", + " anchor_strength=100 # Tell the model how much it should rely on the anchors\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic #1: land, resettlement, degradation\n", + "Topic #2: crops, farm, agriculture\n", + "Topic #3: mining, coal, sand\n", + "Topic #4: forest, trees, deforestation\n", + "Topic #5: animal, attacked, tiger\n", + "Topic #6: drought, climate change, rain\n", + "Topic #7: water, drinking, dams\n" + ] + } + ], + "source": [ + "for i, topic_ngrams in enumerate(model.get_topics(n_words=NBR_OF_WORDS)):\n", + " topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]\n", + " print(\"Topic #{}: {}\".format(i+1, \", \".join(topic_ngrams)))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "topic_df = pd.DataFrame(\n", + " model.transform(tfidf), \n", + " columns=[\"topic_{}\".format(i+1) for i in range(TOPICS)]\n", + ").astype(float)\n", + "topic_df.index = wri.index\n", + "wri = pd.concat([wri, topic_df], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0 0.65286\n", + "1.0 0.34714\n", + "Name: topic_1, dtype: float64\n", + "0.0 0.686391\n", + "1.0 0.313609\n", + "Name: topic_2, dtype: float64\n", + "0.0 0.767258\n", + "1.0 0.232742\n", + "Name: topic_3, dtype: float64\n", + "0.0 0.755424\n", + "1.0 0.244576\n", + "Name: topic_4, dtype: float64\n", + "0.0 0.779093\n", + "1.0 0.220907\n", + "Name: topic_5, dtype: float64\n", + "0.0 0.751479\n", + "1.0 0.248521\n", + "Name: topic_6, dtype: float64\n", + "0.0 0.613412\n", + "1.0 0.386588\n", + "Name: topic_7, dtype: float64\n" + ] + } + ], + "source": [ + "for i in range(TOPICS):\n", + " column='topic_{}'.format(i+1)\n", + " print(wri[column].value_counts(normalize=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0 0.335306\n", + "1.0 0.333333\n", + "3.0 0.147929\n", + "0.0 0.065089\n", + "4.0 0.061144\n", + "5.0 0.041420\n", + "6.0 0.015779\n", + "Name: topic, dtype: float64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Topic Flags\n", + "wri['topic']=wri['topic_1']+wri['topic_2']+wri['topic_3']+wri['topic_4']+wri['topic_5']+wri['topic_6']+wri['topic_7']\n", + "wri['topic'].value_counts(normalize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " class text \\\n", + "5 positive After 20 years of providing Manitoba livestock... \n", + "23 positive The fire was restricted to one section of the ... \n", + "71 positive New Delhi, Oct 2 (IANS) Left parties on Tuesda... \n", + "74 positive Updated: Aug 29, 2019 21:05 IST\\r\\r\\r\\nIndia r... \n", + "82 positive This refers to “Farmers march in De... \n", + "90 positive Chinese troops intrude into Arunachal with roa... \n", + "116 positive farmers’ protest\\r\\r\\r\\nNGO Nation\\... \n", + "129 positive The 'Kisan Kranti Padyatra', which was started... \n", + "141 positive Accept the updated privacy & cookie policy\\r\\r... \n", + "160 positive Advertisement\\r\\r\\n\\r\\r\\nBellandur Lake Group,... \n", + "238 positive Nagpur, MAHARASHTRA — Over one lakh... \n", + "243 positive A plea alleging that rainwater harvesting syst... \n", + "257 positive Prices for farmed vannamei shrimp should be fa... \n", + "274 positive Telangana IT and Industries Minister K T Rama ... \n", + "282 positive New Delhi: Uttar Pradesh’s top poli... \n", + "291 positive Chinese troops intrude into Arunachal with roa... \n", + "292 positive 6.15 am: Hardoi: Shelter home where only 2 out... \n", + "297 positive This story is from September 10, 2018\\r\\r\\r\\nR... \n", + "320 positive Two persons were killed and five others injure... \n", + "332 positive Four days after a fire put the spotlight back ... \n", + "345 positive Governor E.S.L. Narasimhan is being greeted by... \n", + "356 positive The National Green Tribunal has cancelled a ba... \n", + "365 positive Telangana IT and Industries Minister KT Rama R... \n", + "371 positive Advertisement\\r\\r\\n\\r\\r\\nPartha PaulA promoter... \n", + "374 positive The Delhi Police has imposed prohibitory order... \n", + "408 positive This story is from December 29, 2018\\r\\r\\r\\nEa... \n", + "435 positive Pune: As many as 185 people have been detained... \n", + "436 positive “If one inserts his hand inside a s... \n", + "439 positive At least 17 killed in New Delhi factory fire\\r... \n", + "446 positive The party had prepared a separate manifesto fo... \n", + "467 positive By ANI\\r\\r\\r\\nNEW DELHI: National Congress Par... \n", + "478 positive STATE TIMES NEWS\\r\\r\\n\\r\\r\\nJAMMU: Minister fo... \n", + "505 positive Chinese troops intrude into Arunachal with roa... \n", + "\n", + " text_coref \\\n", + "5 After 20 years of providing Manitoba livestock... \n", + "23 The fire was restricted to one section of the ... \n", + "71 New Delhi, Oct 2 (IANS) Left parties on Tuesda... \n", + "74 Updated: Aug 29, 2019 21:05 IST\\r\\r\\r\\r\\nIndia... \n", + "82 This refers to “Farmers march in De... \n", + "90 Chinese troops intrude into Arunachal with roa... \n", + "116 farmers’ protest\\r\\r\\r\\r\\nNGO Natio... \n", + "129 The 'Kisan Kranti Padyatra', which was started... \n", + "141 Accept the updated privacy & cookie policy\\r\\r... \n", + "160 Advertisement\\r\\nBellandur Lake Group, an info... \n", + "238 Nagpur, MAHARASHTRA — Over one lakh... \n", + "243 A plea alleging that rainwater harvesting syst... \n", + "257 Prices for farmed vannamei shrimp should be fa... \n", + "274 Telangana IT and Industries Minister K T Rama ... \n", + "282 New Delhi: Uttar Pradesh’s top poli... \n", + "291 Chinese troops intrude into Arunachal with roa... \n", + "292 6.15 am: Hardoi: Shelter home where only 2 out... \n", + "297 This story is from September 10, 2018\\r\\r\\r\\r\\... \n", + "320 Two persons were killed and five others injure... \n", + "332 Advertisement\\r\\nBellandur Lake Group, an info... \n", + "345 Governor E.S.L. Narasimhan is being greeted by... \n", + "356 The National Green Tribunal has cancelled a ba... \n", + "365 Telangana IT and Industries Minister KT Rama R... \n", + "371 Advertisement\\r\\nPartha PaulA promoter was gun... \n", + "374 The Delhi Police has imposed prohibitory order... \n", + "408 This story is from December 29, 2018\\r\\r\\r\\r\\n... \n", + "435 Pune: As many as 185 people have been detained... \n", + "436 “If one inserts his hand inside a s... \n", + "439 New Delhi: At least 17 workers were killed on ... \n", + "446 The party had prepared a separate manifesto fo... \n", + "467 By ANI\\r\\r\\r\\r\\nNEW DELHI: National Congress P... \n", + "478 STATE TIMES NEWS\\r\\nJAMMU: Minister for Agricu... \n", + "505 Chinese troops intrude into Arunachal with roa... \n", + "\n", + " title topic_1 topic_2 \\\n", + "5 Changes Coming To Manitoba Livestock Manure Ma... 0.0 0.0 \n", + "23 Gujarat: 3 killed as fire breaks out at Relian... 0.0 0.0 \n", + "71 Left parties condemn lathicharge on farmers at... 0.0 0.0 \n", + "74 AIADMK MPs protest against construction of dam... 0.0 0.0 \n", + "82 Letter to BS: Farmers urge govt to hold specia... 0.0 0.0 \n", + "90 Chinese troops intrude into Arunachal with roa... 0.0 0.0 \n", + "116 March in Dadar today to support farmers 0.0 0.0 \n", + "129 Kisan kranti yatra: Farmers call off protest a... 0.0 0.0 \n", + "141 Ghaziabad development body issues eviction not... 0.0 0.0 \n", + "160 New deadlines to revive Bellandur lake 0.0 0.0 \n", + "238 Farmers' March: Why Lakhs Of Protesters Will B... 0.0 0.0 \n", + "243 NGT notice to Delhi govt over rainwater harves... 0.0 0.0 \n", + "257 Gulkin: Global farmed shrimp prices likely sta... 0.0 0.0 \n", + "274 KTR seeks Telangana-NRIs partnership in state'... 0.0 0.0 \n", + "282 Police suspect foul play in Bulandshahr mob fury 0.0 0.0 \n", + "291 Chinese troops intrude into Arunachal with roa... 0.0 0.0 \n", + "292 News Alert! Maratha Reservation: Bicycles set ... 0.0 0.0 \n", + "297 Fearing crop failure, farmer ends life in Amreli 0.0 0.0 \n", + "320 Two dead, five injured as truck hits tractor 0.0 0.0 \n", + "332 New deadlines to revive Bellandur lake 0.0 0.0 \n", + "345 Telangana farmers get free power; KCR announce... 0.0 0.0 \n", + "356 Rainwater harvesting: NGT cancels warrant agai... 0.0 0.0 \n", + "365 World Economic Forum 2018: KT Rama Rao seeks T... 0.0 0.0 \n", + "371 Kolkata: Promoter shoots dead business partner... 0.0 0.0 \n", + "374 Delhi Police imposes prohibitory orders ahead ... 0.0 0.0 \n", + "408 Teams of ex-servicemen to be deployed for patr... 0.0 0.0 \n", + "435 Maharashtra bandh: Hundreds detained for resor... 0.0 0.0 \n", + "436 Shiv Sena apologises to Tigress Avni, calls fo... 0.0 0.0 \n", + "439 At least 17 killed in New Delhi factory fire 0.0 0.0 \n", + "446 BJP banking on women, Congress on farmers 0.0 0.0 \n", + "467 NCP chief Sharad Pawar hits out at Centre for ... 0.0 0.0 \n", + "478 Hanjura calls for adopting latest farm technology 0.0 0.0 \n", + "505 Chinese troops intrude into Arunachal with roa... 0.0 0.0 \n", + "\n", + " topic_3 topic_4 topic_5 topic_6 topic_7 topic \n", + "5 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "23 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "71 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "74 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "82 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "90 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "116 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "129 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "141 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "160 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "238 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "243 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "257 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "274 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "282 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "291 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "292 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "297 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "320 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "332 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "345 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "356 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "365 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "371 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "374 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "408 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "435 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "436 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "439 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "446 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "467 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "478 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "505 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "Int64Index: 33 entries, 5 to 505\n", + "Data columns (total 12 columns):\n", + "class 33 non-null object\n", + "text 33 non-null object\n", + "text_coref 33 non-null object\n", + "title 33 non-null object\n", + "topic_1 33 non-null float64\n", + "topic_2 33 non-null float64\n", + "topic_3 33 non-null float64\n", + "topic_4 33 non-null float64\n", + "topic_5 33 non-null float64\n", + "topic_6 33 non-null float64\n", + "topic_7 33 non-null float64\n", + "topic 33 non-null float64\n", + "dtypes: float64(8), object(4)\n", + "memory usage: 3.4+ KB\n", + "None\n" + ] + } + ], + "source": [ + "MisTagged = wri[wri['topic']==0]\n", + "MisTagged.to_csv(\"MisTagged.csv\")\n", + "print(MisTagged)\n", + "print(MisTagged.info())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}