import requests
import json
import sys
import os
import pandas  # only used by the commented-out process_csv_file below
import time

username = ""
password = ""

old_column_name = ["id", "Openness", "Openness_e", "Adventurousness", "Adventurousness_e",
                   "Artistic interests", "Artistic interests_e", "Emotionality", "Emotionality_e",
                   "Imagination", "Imagination_e", "Intellect", "Intellect_e",
                   "Authority-challenging", "Authority-challenging_e",
                   "Conscientiousness", "Conscientiousness_e",
                   "Achievement striving", "Achievement striving_e", "Cautiousness", "Cautiousness_e",
                   "Dutifulness", "Dutifulness_e", "Orderliness", "Orderliness_e",
                   "Self-discipline", "Self-discipline_e", "Self-efficacy", "Self-efficacy_e",
                   "Extraversion", "Extraversion_e", "Activity level", "Activity level_e",
                   "Assertiveness", "Assertiveness_e", "Cheerfulness", "Cheerfulness_e",
                   "Excitement-seeking", "Excitement-seeking_e", "Outgoing", "Outgoing_e",
                   "Gregariousness", "Gregariousness_e", "Agreeableness", "Agreeableness_e",
                   "Altruism", "Altruism_e", "Cooperation", "Cooperation_e",
                   "Modesty", "Modesty_e", "Uncompromising", "Uncompromising_e",
                   "Sympathy", "Sympathy_e", "Trust", "Trust_e",
                   "Emotional range", "Emotional range_e", "Fiery", "Fiery_e",
                   "Prone to worry", "Prone to worry_e", "Melancholy", "Melancholy_e",
                   "Immoderation", "Immoderation_e", "Self-consciousness", "Self-consciousness_e",
                   "Susceptible to stress", "Susceptible to stress_e",
                   "Challenge", "Challenge_e", "Closeness", "Closeness_e",
                   "Curiosity", "Curiosity_e", "Excitement", "Excitement_e",
                   "Harmony", "Harmony_e", "Ideal", "Ideal_e", "Liberty", "Liberty_e",
                   "Love", "Love_e", "Practicality", "Practicality_e",
                   "Self-expression", "Self-expression_e", "Stability", "Stability_e",
                   "Structure", "Structure_e", "Conservation", "Conservation_e",
                   "Openness to change", "Openness to change_e", "Hedonism", "Hedonism_e",
                   "Self-enhancement", "Self-enhancement_e",
                   "Self-transcendence", "Self-transcendence_e"]

# For every column in the old schema, add a matching raw-score column
# prefixed with "R_" directly after it.
column_name = ['id']
for column in old_column_name[1:]:
    column_name.append(column)
    column_name.append('R_' + column)
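# The resulting header order per trait is therefore:
#   <trait>, R_<trait>, <trait>_e, R_<trait>_e
# e.g. Openness, R_Openness, Openness_e, R_Openness_e, which lines up with
# the per-trait value order produced by parse_json below:
#   percentage, raw_score, sampling_error, raw_sampling_error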
# def pre_aggregate(dir_name, file_list):
#     """Aggregate the data for each person and output a txt file per person
#     to the directory 'Aggregated_files'
#
#     Args:
#     =====
#     dir_name: The name of the directory containing all the data to be aggregated
#     file_list: The list of all the files in the target directory
#
#     Returns:
#     ========
#     output_file_list: The list of filenames in the output directory 'Aggregated_files'
#     """
#     name_list = []
#     i = 0
#     out_dir_name = 'Aggregated_files'
#     if not os.path.exists(out_dir_name):
#         os.mkdir(out_dir_name)
#     for file_name in file_list:
#         name_list = []
#         with open(dir_name + '/' + file_name, "rb") as f:
#             for line in f:
#                 # skip the header line
#                 if i == 0:
#                     i += 1
#                     continue
#                 try:
#                     [cid, ordernum, fname, lname, content] = line.strip().split('\t')
#                 except:
#                     [cid, ordernum, fname, lname] = line.strip().split('\t')
#                     print line
#                 if '?' in lname:
#                     lname = lname.strip('?')
#                 if (fname, lname) not in name_list:
#                     fout = open(out_dir_name + '/' + '_'.join((fname, lname)) + '.txt', "w")
#                     fout.write(fname + '\t' + lname + '\n')
#                     fout.write(cid + '\t' + content + '\n\n')
#                     name_list.append((fname, lname))
#                 else:
#                     fout = open(out_dir_name + '/' + '_'.join((fname, lname)) + '.txt', "a")
#                     fout.write(cid + '\t' + content + '\n\n')
#                 fout.close()
#     output_file_list = ['_'.join((x[0], x[1])) + '.txt' for x in name_list]
#     return output_file_list


# def read_data(dir_name, file_list):
#     """Read all the data from a specific directory
#
#     Args:
#     =====
#     dir_name: The name of the directory containing all the aggregated data
#     file_list: The list of all the files in the target directory
#
#     Returns:
#     ========
#     text: A list of tuples containing the text, id, and type of each file
#     """
#     text = []
#     for file_name in file_list:
#         if file_name[-3:] == "txt":
#             content = process_txt_file(dir_name + "/" + file_name)
#             tmp = file_name[:-4]
#             text.append((content, tmp.split('_')[0], tmp.split('_')[1]))
#         # elif file_name[-3:] == "csv":
#         #     data = process_csv_file(dir_name + "/" + file_name)
#         #     if data is not None:
#         #         text.append((data, file_name[:-4], "csv"))
#     return text


# def process_txt_file(file_name):
#     """Clean the txt file
#
#     Args:
#     =====
#     file_name: The name of the file to process
#
#     Returns:
#     ========
#     content: pure text of the file
#     """
#     i = 0
#     content = ""
#     with open(file_name, "rb") as f:
#         for line in f:
#             # skip the header line
#             if i == 0:
#                 i += 1
#                 continue
#             content += " ".join(line.strip().split(" ")[1:])
#     return content


# def process_csv_file(file_name):
#     """Clean the csv file and skip useless csv files
#
#     Args:
#     =====
#     file_name: The name of the file to process
#
#     Returns:
#     ========
#     content: pure text of the file, or None if the file has no "tweet" column
#     """
#     file = pandas.read_csv(file_name)
#     if "tweet" not in file.columns:
#         return None
#     else:
#         content = " ".join(file["tweet"].tolist())
#         return content


def process_txt_file_2(dir_name, file_name):
    """Parse a tab-separated file into [id, text] records, concatenating
    consecutive lines that share the same id."""
    i = 0
    text = []
    with open(dir_name + "/" + file_name, "rb") as f:
        for line in f:
            # skip the header line
            if i == 0:
                i += 1
                continue
            # skip blank lines
            if line == '\r\n' or line == '\n':
                continue
            # same id as the previous record: append the text to it
            if len(text) > 0 and text[-1][0] == line.split('\t')[0]:
                text[-1][1] += line.split('\t')[1]
                continue
            text.append(line.split('\t'))
    return text


def parse_json(tree, id):
    """Get all the personality traits and their corresponding sampling errors
    from the profile returned by IBM Watson

    Args:
    =====
    tree: a nested dictionary which contains all the information returned by
          IBM Watson
    id: the id of the current user

    Returns:
    ========
    personalities: a list of all the personality scores, starting with the id
    """
    personalities = [id]
    if "tree" not in tree:
        # the request failed; pad the row with zeros so the schema still lines up
        personalities.extend(["0"] * (len(column_name) - 1))
    else:
        for name in tree["tree"]["children"]:
            for personality in name["children"][0]["children"]:
                personalities.extend([str(personality["percentage"]),
                                      str(personality["raw_score"]),
                                      str(personality["sampling_error"]),
                                      str(personality["raw_sampling_error"])])
                if name["name"] == "Big 5":
                    for sub_personality in personality["children"]:
                        personalities.extend([str(sub_personality["percentage"]),
                                              str(sub_personality["raw_score"]),
                                              str(sub_personality["sampling_error"]),
                                              str(sub_personality["raw_sampling_error"])])
    return personalities
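# A sketch of the v2 profile shape parse_json expects, inferred from the
# traversal above (field layout assumed, not verified against Watson docs):
#
# {"tree": {"children": [              # one node per model (Big 5, Needs, Values)
#     {"name": "Big 5",
#      "children": [{"children": [     # the traits of that model
#          {"percentage": ..., "raw_score": ...,
#           "sampling_error": ..., "raw_sampling_error": ...,
#           "children": [...]}         # Big 5 traits also carry facet nodes
#      ]}]},
#     ...]}}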
def analyze_text(text):
    """Use IBM Watson to analyze the text: post each (id, text) record to the
    Personality Insights v2 profile endpoint and parse the JSON response"""
    total_personalities = map(lambda info: parse_json(
        json.loads(requests.post("https://gateway.watsonplatform.net/personality-insights/api/v2/profile",
                                 auth=(username, password),
                                 headers={"content-type": "text/plain"},
                                 params={"include_raw": True},
                                 data=info[1]).text),
        info[0]), text)
    return total_personalities


def save_to_txt_file(output_filename, header, data):
    """Save the result to a tab-separated file

    Args:
    =====
    output_filename: the name of the output file
    header: schema
    data: the data you want to save
    """
    with open(output_filename, "wb") as f:
        f.write("\t".join(header) + "\n")
        final_data = "\n".join(map(lambda x: "\t".join(x), data))
        f.write(final_data)


def main():
    start = time.time()
    dir_name = sys.argv[1]
    file_list = os.listdir(dir_name)
    if not os.path.isdir('output/'):
        os.mkdir("output/")
    for file_name in file_list:
        data = process_txt_file_2(dir_name, file_name)
        columns = analyze_text(data)
        save_to_txt_file('output/' + file_name, column_name, columns)
    end = time.time()
    print end - start


if __name__ == '__main__':
    main()
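# Usage sketch (Python 2). Fill in `username` and `password` with IBM Watson
# Personality Insights credentials first, then run (assuming a hypothetical
# filename of personality.py):
#
#   python personality.py <input_dir>
#
# Every tab-separated file in <input_dir> is scored and written to
# output/<file_name>, one row per user id, using the columns in column_name.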