Week 29

Here is the final code used to extract all of the certain (i.e. non-hedged) sentences from all of the articles.

It resulted in nearly 96,000 .csv files of certain sentences, ready for testing.
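Each of those output files can then be read back in with pandas for the testing step. The snippet below is only a sketch of that: the 'certain/' directory, the example filename, and the single 'sent' column are assumptions for illustration, not the exact output format written by the script.

import pandas as pd

# load one per-article file of certain sentences (hypothetical path and column name)
certain = pd.read_csv('certain/example-article.csv', names=['sent'])
print(certain['sent'].head())
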

article-parsing.py

# coding: utf-8

import re
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import os
import csv

# column names for the training data: the sentence text and its hedge label
name = ['sent','hedge']

# training data for hedge detection: labelled sentences from full biomedical articles
train = pd.read_csv("../../HedgeDetection/bag-of-words/train/TRAIN_biomedical_fullarticles.csv",
                    names=name, delimiter="\t", quoting=3)

# bag-of-words features: lowercased unigrams and bigrams, English stop words removed
vectorizer = CountVectorizer(analyzer='word',
                             stop_words="english",
                             ngram_range=(1, 2),
                             lowercase=True)
train_data_feature = vectorizer.fit_transform(train['sent'])

# random forest classifier trained to label each sentence as hedged or not
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_feature, train["hedge"])

# title.csv lists the filenames of the downloaded article full texts
title = pd.read_csv('title.csv', names=['article'])

# classify the sentences of every article listed in title.csv
i = 0
while i < title['article'].shape[0]:
    with open('fulltext/' + title['article'][i], 'r') as f:
        # skip any article files that turned out to be empty
        if os.stat('fulltext/' + title['article'][i]).st_size != 0:
            text = f.read()
            # collapse runs of whitespace, then split the article into sentences
            sentences = re.sub(r'\s+', ' ', text)
            article_tokenize_list = sent_tokenize(sentences)
            #print(article_tokenize_list)
            # vectorize the sentences with the vocabulary fitted on the training data
            article_data_feature = vectorizer.transform(article_tokenize_list)
            #print(article_data_feature)
            article_data_feature = article_data_feature.toarray()