# NOTE(review): the three lines below were paste residue from a hosting-platform
# runtime-error log ("Spaces: / Runtime error / Runtime error"), not part of the code.
import nltk

# Fetch the NLTK resources this module depends on (stopwords, WordNet, the
# punkt sentence tokenizer used by sent_tokenize).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize
import string
import subprocess
import logging

try:
    import pke
    logging.error("importing pke info")
except ImportError:
    # pke is not on PyPI; install it straight from GitHub, then fetch the
    # spaCy English model it needs. check=True makes a failed install raise
    # instead of falling through to a second, crashing `import pke`.
    logging.error("installing pke info")
    subprocess.run(
        ['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'],
        check=True,
    )
    # spaCy v3 removed the bare 'en' shortcut; the model must be named explicitly.
    subprocess.run(
        ['python3', '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
    )
    import pke

# Combined stoplist used by all extractors below: punctuation, pke's English
# stopwords, Penn-Treebank bracket tokens, and NLTK's English stopwords.
stoplist = list(string.punctuation)
stoplist += pke.lang.stopwords.get('en')
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')
def tokenize_sentence(text):
    """Split *text* into sentences and return the stripped ones longer than 20 chars.

    Args:
        text: Raw input text.

    Returns:
        list[str]: Whitespace-stripped sentences whose original length exceeds 20.
    """
    sentences = sent_tokenize(text)
    # strip() already trims both ends; the original strip().lstrip().rstrip()
    # chain was redundant.
    return [s.strip() for s in sentences if len(s) > 20]
def get_multipartiterank_topics(text):
    """Extract up to 5 keyphrases from *text* using pke's MultipartiteRank.

    Args:
        text: Document text to mine for keyphrases.

    Returns:
        list[str]: Deduplicated keyphrases in ranked (best-first) order;
        empty if extraction fails.
    """
    output = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text, language='en', normalization=None,
                                stoplist=stoplist)
        # Candidate POS tags; other options: 'ADP' 'ADV' 'AUX' 'DET' 'NUM'
        # 'PART' 'PROPN' 'PUNCT'
        extractor.candidate_selection(pos={'NOUN', 'VERB', 'ADJ'})
        extractor.candidate_weighting(threshold=0.7, method='average', alpha=1.1)
        for phrase, _score in extractor.get_n_best(n=5):
            output.append(phrase)
    except Exception as e:
        # Best-effort: log and return whatever was collected so far.
        logging.error("found exception %s", e)
    # dict.fromkeys dedupes while preserving rank order (list(set(...)) would
    # scramble it non-deterministically).
    return list(dict.fromkeys(output))
def get_topicrank_topics(text):
    """Extract up to 5 keyphrases from *text* using pke's TopicRank.

    Args:
        text: Document text to mine for keyphrases.

    Returns:
        list[str]: Deduplicated keyphrases in ranked (best-first) order;
        empty if extraction fails.
    """
    output = []
    try:
        extractor = pke.unsupervised.TopicRank()
        extractor.load_document(input=text, language='en', normalization=None,
                                stoplist=stoplist)
        # Candidate POS tags; other options: 'ADP' 'ADV' 'AUX' 'DET' 'NUM'
        # 'PART' 'PROPN' 'PUNCT' 'VERB'
        extractor.candidate_selection(pos={'NOUN', 'ADJ'})
        extractor.candidate_weighting(threshold=0.7, method='average')
        for phrase, _score in extractor.get_n_best(n=5):
            output.append(phrase)
    except Exception as e:
        # Best-effort: log and return whatever was collected so far.
        logging.error("found exception %s", e)
    # dict.fromkeys dedupes while preserving rank order (list(set(...)) would
    # scramble it non-deterministically).
    return list(dict.fromkeys(output))
def get_yake_topics(text):
    """Extract up to 5 keyphrases from *text* using pke's YAKE extractor.

    Statistics-based model — noted in the original as having very poor
    performance relative to the graph-based extractors above.

    Args:
        text: Document text to mine for keyphrases.

    Returns:
        list[str]: Deduplicated keyphrases in ranked (best-first) order;
        empty if extraction fails.
    """
    output = []
    try:
        extractor = pke.unsupervised.YAKE()
        extractor.load_document(input=text, language='en', normalization=None,
                                stoplist=stoplist)
        extractor.candidate_selection(n=3)
        extractor.candidate_weighting(window=2)
        for phrase, _score in extractor.get_n_best(n=5, threshold=0.9):
            output.append(phrase)
    except Exception as e:
        # Best-effort: log and return whatever was collected so far.
        logging.error("found exception %s", e)
    # dict.fromkeys dedupes while preserving rank order (list(set(...)) would
    # scramble it non-deterministically).
    return list(dict.fromkeys(output))