I'm struggling to find a faster method in python to compare each word in a row against a list of over 1M words.
Does anyone know a faster way to do it?
This is my code, but it takes more than 30 minutes to run the check.
def main(): [i for i in range(0, 100)] start_time = time.clock() col_target = 'all_FINAL_KEYWORDS' col_restrict = 'brands' # function to remove punctuation from columns def remove_punctuations(text): for punctuation in string.punctuation: text = str(text).replace(punctuation, ' ') return text df['all_FINAL_KEYWORDS'] = df['all_FINAL_KEYWORDS'].apply(remove_punctuations) df['brands'] = df['brands'].apply(remove_punctuations) def parse_str_into_list(s): if str(s).startswith('[') and str(s).endswith(']'): return ' '.join(str(s).strip('[]').strip("'").split("', '")) return str(s) def filter_restrict_words(row): targets = parse_str_into_list(row[0]).split(' ', -1) restricts = df[col_restrict].tolist() # print('processing...') words_to_keep = [] for word in targets: # condition to keep eligible words if word not in restricts and word not in words_to_keep: words_to_keep.append(word) print(words_to_keep) return ' '.join(words_to_keep) df['all_FINAL_KEYWORDS'] = df[[col_target, col_restrict]].apply(lambda x: filter_restrict_words(x), axis=1) main() print('time duration: ', time.clock() - start_time, "seconds")
https://stackoverflow.com/questions/66810081/how-to-compare-efficiently-each-string-in-a-row-against-over-1m-strings-in-pytho March 26, 2021 at 10:03AM
没有评论:
发表评论