Week 25

My task: To work on creating the code to separate and test the articles from the new dataset we created

There are over 100,000 articles in this new dataset. So, in the following code, I attempted to combine sections of code created in the earlier stages of the research and apply them to a whole folder of articles instead of to only one article.

# coding: utf-8

import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import os

# Column names for the training CSV: each row is a sentence plus its
# hedge label (1 = hedged/uncertain, 0 = certain).
name = ['sent', 'hedge']

# Training data for hedge-detection. quoting=3 (csv.QUOTE_NONE) because the
# tab-separated sentences may contain unbalanced quote characters.
train = pd.read_csv("../../parseSent/usingBOW/TRAIN_biomedical_fullarticles_version2.csv",
                    names=name, delimiter="\t", quoting=3)

# Bag-of-words features: unigrams + bigrams, lowercased, English stop words removed.
# NOTE(fix): the original call was missing its closing parenthesis, which made
# the whole file a syntax error.
vectorizer = CountVectorizer(analyzer='word',
                             stop_words="english",
                             ngram_range=(1, 2),
                             lowercase=True)
train_data_feature = vectorizer.fit_transform(train['sent'])

# Random forest trained on the sentence features against the hedge labels.
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_feature, train["hedge"])

# title.csv lists the file names of all 100,003 articles in the dataset,
# one per line; they are loaded into a single column called 'text'.
title = pd.read_csv('title.csv', names=['text'])

# Quick sanity checks on the loaded index.
print(title.head())  # first 5 rows of the 'text' column
#                   text
# 0  ainews:00012214.txt
# 1  ainews:0004CF01.txt
# 2  ainews:000580DE.txt
# 3  ainews:00059AEB.txt
# 4  ainews:0006365E.txt

print(title['text'][0])  # very first file name
# ainews:00012214.txt

print(title['text'].shape[0])  # total number of file names
# 100003

# Runs through the dataset and classifies every article listed in title.csv.
# For each non-empty article, every sentence is scored by the trained forest
# and the sentences predicted as hedge==1 are written to a file of the same
# name inside the "certain-articles" folder.
# NOTE(fix): the original version never read the opened file, indexed several
# undefined names (article, article_tokenize_list, result, ...), and never
# wrote any output; this rewrite keeps the same intent but actually runs.
os.makedirs('certain-articles', exist_ok=True)

for fname in title['text']:
    path = 'fulltext/' + fname
    # Skip empty files up front — tokenizing "" would yield no sentences
    # and vectorizer.transform([]) would raise.
    if os.stat(path).st_size == 0:
        continue
    with open(path, 'r') as myfile:
        sentences = sent_tokenize(myfile.read())
    if not sentences:
        continue
    # The sparse matrix can be fed to predict() directly; no toarray() needed
    # (densifying 100k articles would exhaust memory).
    features = vectorizer.transform(sentences)
    predictions = forest.predict(features)
    output = pd.DataFrame(data={"sent": sentences, "hedge": predictions})
    # Keep only the sentences the model labels as 1 ("certain" per the
    # research notes above) and write them out, one sentence per line.
    certain_sentences = output[output.hedge == 1]['sent']
    certain_sentences.to_csv(os.path.join('certain-articles', fname),
                             index=False, header=False)