first commit
- .dockerignore +20 -0
- .gitignore +160 -0
- README.md +5 -5
- app.py +70 -0
- configuration.py +13 -0
- dataset.py +12 -0
- model.py +46 -0
- models_file/config.pth +3 -0
- models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth +3 -0
- models_file/tokenizer/merges.txt +0 -0
- models_file/tokenizer/special_tokens_map.json +51 -0
- models_file/tokenizer/tokenizer.json +0 -0
- models_file/tokenizer/tokenizer_config.json +66 -0
- models_file/tokenizer/vocab.json +0 -0
- requirements.txt +5 -0
- utils.py +104 -0
.dockerignore
ADDED
@@ -0,0 +1,20 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
.gitignore
ADDED
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Entity Extraction
+emoji: 📚
+colorFrom: red
+colorTo: yellow
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.28.3
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,70 @@
+if __name__ == '__main__':
+    inputs = ['gbjjhbdjhbdgjhdbfjhsdkjrkjf', 'fdjhbjhsbd']
+    from transformers import AutoTokenizer
+    from model import CustomModel
+    import torch
+    from configuration import CFG
+    from dataset import SingleInputDataset
+    from torch.utils.data import DataLoader
+    from utils import inference_fn, get_char_probs, get_results, get_text
+    import numpy as np
+    import gradio as gr
+    import os
+
+    device = torch.device('cpu')
+    config_path = os.path.join('models_file', 'config.pth')
+    model_path = os.path.join('models_file', 'microsoft-deberta-base_0.9449373420387531_8_best.pth')
+    tokenizer = AutoTokenizer.from_pretrained('models_file/tokenizer')
+    model = CustomModel(CFG, config_path=config_path, pretrained=False)
+    state = torch.load(model_path,
+                       map_location=torch.device('cpu'))
+    model.load_state_dict(state['model'])
+
+    def get_answer(context, feature):
+
+        ## Input to the model using patient-history and feature-text
+        inputs_single = tokenizer(context, feature,
+                                  add_special_tokens=True,
+                                  max_length=CFG.max_len,
+                                  padding="max_length",
+                                  return_offsets_mapping=False)
+
+        for k, v in inputs_single.items():
+            inputs_single[k] = torch.tensor(v, dtype=torch.long)
+
+        # Create a new dataset containing only the input sample
+        single_input_dataset = SingleInputDataset(inputs_single)
+        # Create a DataLoader for the new dataset
+        single_input_loader = DataLoader(
+            single_input_dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=2
+        )
+
+        # Perform inference on the single input
+        output = inference_fn(single_input_loader, model, device)
+
+        prediction = output.reshape((1, CFG.max_len))
+        char_probs = get_char_probs([context], prediction, tokenizer)
+        predictions = np.mean([char_probs], axis=0)
+        results = get_results(predictions, th=0.5)
+
+        print(results)
+        return get_text(context, results[0])
+
+    inputs = [gr.inputs.Textbox(label="Context Para", lines=10), gr.inputs.Textbox(label="Question", lines=1)]
+    output = gr.outputs.Textbox(label="Answer")
+    article = "<p style='text-align: center'><a href='https://www.xelpmoc.in/' target='_blank'>Made by Xelpmoc</a></p>"
+
+    app = gr.Interface(
+        fn=get_answer,
+        inputs=inputs,
+        outputs=output,
+        allow_flagging='never',
+        title="Entity Extraction from Text",
+        article=article,
+        enable_queue=True,
+        cache_examples=False)
+
+    app.launch()
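
Note: once `app.launch()` is running, the interface can also be exercised without the browser UI. A minimal sketch, assuming the app is running locally on Gradio 3.x's default port 7860; the `/run/predict` route and `data` payload shape follow the Gradio 3.x REST convention, and the context/question strings are placeholders:

import requests

# Hypothetical request against a locally running instance of this app.
# Gradio 3.x Interfaces accept the input components' values, in order,
# under the "data" key of a JSON POST to /run/predict.
payload = {
    "data": [
        "Patient reports intermittent chest pain for two weeks.",  # Context Para
        "chest pain",                                              # Question
    ]
}

resp = requests.post("http://127.0.0.1:7860/run/predict", json=payload, timeout=60)
resp.raise_for_status()

# The Answer textbox value comes back as the first element of "data".
print(resp.json()["data"][0])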
configuration.py
ADDED
@@ -0,0 +1,13 @@
+# ====================================================
+# CFG
+# ====================================================
+class CFG:
+    print_freq=100
+    num_workers=0
+    model="microsoft/deberta-base"
+    token="microsoft/deberta-base"
+    fc_dropout=0.2
+    max_len=739
+    weight_decay=0.01
+    project_folder = '/content/drive/MyDrive/Projects/Exigent/POC-V1/'
+    matching_data = 'matching_data.csv'
dataset.py
ADDED
@@ -0,0 +1,12 @@
+from torch.utils.data import Dataset
+
+# Create a custom dataset class that takes a single input sample
+class SingleInputDataset(Dataset):
+    def __init__(self, input_single):
+        self.sample = input_single
+
+    def __len__(self):
+        return 1
+
+    def __getitem__(self, index):
+        return self.sample
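
Note: because `__len__` is fixed at 1 and `__getitem__` ignores its index, a DataLoader over this dataset yields exactly one batch, with the default collate function adding the leading batch dimension. A quick sketch with a hypothetical tokenized sample (the key names and the 739 length mirror app.py and CFG.max_len):

import torch
from torch.utils.data import DataLoader
from dataset import SingleInputDataset

# Hypothetical tokenized sample: a dict of 1-D tensors, as built in app.py.
sample = {
    "input_ids": torch.zeros(739, dtype=torch.long),
    "attention_mask": torch.ones(739, dtype=torch.long),
}

loader = DataLoader(SingleInputDataset(sample), batch_size=1, shuffle=False)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # torch.Size([1, 739]) -- collate adds the batch dim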
model.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModel
+
+# ====================================================
+# Model
+# ====================================================
+class CustomModel(nn.Module):
+    def __init__(self, cfg, config_path=None, pretrained=False):
+        super().__init__()
+        self.cfg = cfg
+
+        if config_path is None:
+            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
+        else:
+            self.config = torch.load(config_path)
+        if pretrained:
+            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
+        else:
+            self.model = AutoModel.from_config(self.config)
+        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
+        self.fc = nn.Linear(self.config.hidden_size, 1)
+        self._init_weights(self.fc)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def feature(self, inputs):
+        outputs = self.model(**inputs)
+        last_hidden_states = outputs[0]
+        return last_hidden_states
+
+    def forward(self, inputs):
+        feature = self.feature(inputs)
+        output = self.fc(self.fc_dropout(feature))
+        return output
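
Note: the head is a per-token binary classifier, so `forward` returns one logit per input token rather than a pooled score. A minimal shape sketch, assuming the saved config at models_file/config.pth is available as in app.py (weights here are randomly initialized; only the shapes are meaningful):

import torch
from configuration import CFG
from model import CustomModel

model = CustomModel(CFG, config_path='models_file/config.pth', pretrained=False)
model.eval()

# Hypothetical dummy batch: batch_size=1, seq_len=16.
inputs = {
    'input_ids': torch.zeros(1, 16, dtype=torch.long),
    'attention_mask': torch.ones(1, 16, dtype=torch.long),
}
with torch.no_grad():
    logits = model(inputs)
print(logits.shape)  # torch.Size([1, 16, 1]) -- one logit per token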
models_file/config.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44242dd46e256e33385a5be4979c8df941af4ae4d8ad5f2feb5315d114da5f98
+size 2541
models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:994ef334eed041e7b0d62f2ad3f97444adcac4696a8027a5b14bf803bb27265f
+size 555618276
models_file/tokenizer/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
models_file/tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,66 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "tokenizer_class": "DebertaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "vocab_type": "gpt2"
+}
models_file/tokenizer/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+numpy
+scikit-learn
+gradio
utils.py
ADDED
@@ -0,0 +1,104 @@
+import itertools
+import torch
+import numpy as np
+from tqdm.auto import tqdm
+
+def get_char_probs(texts, predictions, tokenizer):
+    """
+    Maps predictions from the encoded offset mapping back onto the text.
+
+    Prediction = 466 sequence length * batch
+    text = 768 * batch
+    Using offset mapping [(0, 4), ] -- 466
+
+    Creates results the same size as the texts.
+
+    For each text result[i]:
+    result[0:4] = pred[0], and likewise for all offsets.
+
+    """
+    results = [np.zeros(len(t)) for t in texts]
+    for i, (text, prediction) in enumerate(zip(texts, predictions)):
+        encoded = tokenizer(text,
+                            add_special_tokens=True,
+                            return_offsets_mapping=True)
+        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
+            start = offset_mapping[0]
+            end = offset_mapping[1]
+            results[i][start:end] = pred
+    return results
+
+
+def get_results(char_probs, th=0.5):
+    """
+    Gets the list of probabilities, the same size as the text,
+    and then gets the indices of the characters that score above th.
+    example:
+    char_prob = [0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7] ## length == 766
+    indices where > 0.5 ## [2, 3, 4, 5, 9, 10, 11]
+
+    Group consecutive indices -- [[2, 3, 4, 5], [9, 10, 11]]
+    then take the min and max of each group as the output.
+
+    """
+    results = []
+    for char_prob in char_probs:
+        result = np.where(char_prob >= th)[0] + 1
+        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
+        result = [f"{min(r)} {max(r)}" for r in result]
+        result = ";".join(result)
+        results.append(result)
+    return results
+
+
+def get_predictions(results):
+    """
+    Returns the locations as strings, just like the location column in the df.
+    results = ['2 5', '9 11']
+
+    Loop through, split each into start and end, and store them in an array.
+    """
+    predictions = []
+    for result in results:
+        prediction = []
+        if result != "":
+            for loc in [s.split() for s in result.split(';')]:
+                start, end = int(loc[0]), int(loc[1])
+                prediction.append([start, end])
+        predictions.append(prediction)
+    return predictions
+
+def inference_fn(test_loader, model, device):
+    preds = []
+    model.eval()
+    model.to(device)
+    tk0 = tqdm(test_loader, total=len(test_loader))
+    for inputs in tk0:
+        for k, v in inputs.items():
+            inputs[k] = v.to(device)
+        with torch.no_grad():
+            y_preds = model(inputs)
+        preds.append(y_preds.sigmoid().numpy())
+    predictions = np.concatenate(preds)
+    return predictions
+
+def get_text(context, indexes):
+    if (indexes):
+        if ';' in indexes:
+            list_indexes = indexes.split(';')
+
+            answer = ''
+            for idx in list_indexes:
+                start_index = int(idx.split(' ')[0])
+                end_index = int(idx.split(' ')[1])
+                answer += ' '
+                answer += context[start_index:end_index]
+            return answer
+        else:
+            start_index = int(indexes.split(' ')[0])
+            end_index = int(indexes.split(' ')[1])
+
+            return context[start_index:end_index]
+    else:
+        return 'Not found in this Context'
+
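
Note: to make the span logic concrete, here is a worked example using the probabilities from the `get_results` docstring. The `+ 1` in `np.where(char_prob >= th)[0] + 1` shifts every index up by one, and `get_text` then uses the resulting "min max" pairs directly as slice bounds on the context:

import numpy as np
from utils import get_results, get_text

# Probabilities from the get_results docstring: two runs above the 0.5 threshold.
char_prob = np.array([0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7])

# 0-based indices >= 0.5 are [2, 3, 4, 5] and [9, 10, 11]; after the +1 shift,
# consecutive runs are grouped and rendered as "min max" span strings.
print(get_results([char_prob], th=0.5))  # ['3 6;10 12']

context = "abcdefghijkl"
print(get_text(context, '3 6;10 12'))    # ' def kl' -- context[3:6] + context[10:12]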