first commit
- .dockerignore +20 -0
- .gitignore +160 -0
- README.md +5 -5
- app.py +70 -0
- configuration.py +13 -0
- dataset.py +12 -0
- model.py +46 -0
- models_file/config.pth +3 -0
- models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth +3 -0
- models_file/tokenizer/merges.txt +0 -0
- models_file/tokenizer/special_tokens_map.json +51 -0
- models_file/tokenizer/tokenizer.json +0 -0
- models_file/tokenizer/tokenizer_config.json +66 -0
- models_file/tokenizer/vocab.json +0 -0
- requirements.txt +5 -0
- utils.py +104 -0
.dockerignore
ADDED
@@ -0,0 +1,20 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
.gitignore
ADDED
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Entity Extraction
+emoji: 📚
+colorFrom: red
+colorTo: yellow
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.28.3
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,70 @@
+if __name__ == '__main__':
+    inputs = ['gbjjhbdjhbdgjhdbfjhsdkjrkjf', 'fdjhbjhsbd']
+    from transformers import AutoTokenizer
+    from model import CustomModel
+    import torch
+    from configuration import CFG
+    from dataset import SingleInputDataset
+    from torch.utils.data import DataLoader
+    from utils import inference_fn, get_char_probs, get_results, get_text
+    import numpy as np
+    import gradio as gr
+    import os
+
+    device = torch.device('cpu')
+    config_path = os.path.join('models_file', 'config.pth')
+    model_path = os.path.join('models_file', 'microsoft-deberta-base_0.9449373420387531_8_best.pth')
+    tokenizer = AutoTokenizer.from_pretrained('models_file/tokenizer')
+    model = CustomModel(CFG, config_path=config_path, pretrained=False)
+    state = torch.load(model_path,
+                       map_location=torch.device('cpu'))
+    model.load_state_dict(state['model'])
+
+    def get_answer(context, feature):
+
+        ## Input to the model using patient-history and feature-text
+        inputs_single = tokenizer(context, feature,
+                                  add_special_tokens=True,
+                                  max_length=CFG.max_len,
+                                  padding="max_length",
+                                  return_offsets_mapping=False)
+
+        for k, v in inputs_single.items():
+            inputs_single[k] = torch.tensor(v, dtype=torch.long)
+
+        # Create a new dataset containing only the input sample
+        single_input_dataset = SingleInputDataset(inputs_single)
+        # Create a DataLoader for the new dataset
+        single_input_loader = DataLoader(
+            single_input_dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=2
+        )
+
+        # Perform inference on the single input
+        output = inference_fn(single_input_loader, model, device)
+
+        prediction = output.reshape((1, CFG.max_len))
+        char_probs = get_char_probs([context], prediction, tokenizer)
+        predictions = np.mean([char_probs], axis=0)
+        results = get_results(predictions, th=0.5)
+
+        print(results)
+        return get_text(context, results[0])
+
+    inputs = [gr.inputs.Textbox(label="Context Para", lines=10), gr.inputs.Textbox(label="Question", lines=1)]
+    output = gr.outputs.Textbox(label="Answer")
+    article = "<p style='text-align: center'><a href='https://www.xelpmoc.in/' target='_blank'>Made by Xelpmoc</a></p>"
+
+    app = gr.Interface(
+        fn=get_answer,
+        inputs=inputs,
+        outputs=output,
+        allow_flagging='never',
+        title="Entity Extraction from Text",
+        article=article,
+        enable_queue=True,
+        cache_examples=False)
+
+    app.launch()
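
Note: once `app.launch()` is running, the interface can also be exercised without the browser UI. A minimal sketch, assuming the app is running locally on Gradio 3.x's default port 7860; the `/run/predict` route and `data` payload shape follow the Gradio 3.x REST convention, and the context/question strings are placeholders:

import requests

# Hypothetical request against a locally running instance of this app.
# Gradio 3.x Interfaces accept the input components' values, in order,
# under the "data" key of a JSON POST to /run/predict.
payload = {
    "data": [
        "Patient reports intermittent chest pain for two weeks.",  # Context Para
        "chest pain",                                              # Question
    ]
}

resp = requests.post("http://127.0.0.1:7860/run/predict", json=payload, timeout=60)
resp.raise_for_status()

# The Answer textbox value comes back as the first element of "data".
print(resp.json()["data"][0])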
configuration.py
ADDED
@@ -0,0 +1,13 @@
+# ====================================================
+# CFG
+# ====================================================
+class CFG:
+    print_freq=100
+    num_workers=0
+    model="microsoft/deberta-base"
+    token="microsoft/deberta-base"
+    fc_dropout=0.2
+    max_len=739
+    weight_decay=0.01
+    project_folder = '/content/drive/MyDrive/Projects/Exigent/POC-V1/'
+    matching_data = 'matching_data.csv'
dataset.py
ADDED
@@ -0,0 +1,12 @@
+from torch.utils.data import Dataset
+
+# Create a custom dataset class that takes a single input sample
+class SingleInputDataset(Dataset):
+    def __init__(self, input_single):
+        self.sample = input_single
+
+    def __len__(self):
+        return 1
+
+    def __getitem__(self, index):
+        return self.sample
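
Note: because `__len__` is fixed at 1 and `__getitem__` ignores its index, a DataLoader over this dataset yields exactly one batch, with the default collate function adding the leading batch dimension. A quick sketch with a hypothetical tokenized sample (the key names and the 739 length mirror app.py and CFG.max_len):

import torch
from torch.utils.data import DataLoader
from dataset import SingleInputDataset

# Hypothetical tokenized sample: a dict of 1-D tensors, as built in app.py.
sample = {
    "input_ids": torch.zeros(739, dtype=torch.long),
    "attention_mask": torch.ones(739, dtype=torch.long),
}

loader = DataLoader(SingleInputDataset(sample), batch_size=1, shuffle=False)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # torch.Size([1, 739]) -- collate adds the batch dim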
model.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModel
+
+# ====================================================
+# Model
+# ====================================================
+class CustomModel(nn.Module):
+    def __init__(self, cfg, config_path=None, pretrained=False):
+        super().__init__()
+        self.cfg = cfg
+
+        if config_path is None:
+            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
+        else:
+            self.config = torch.load(config_path)
+        if pretrained:
+            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
+        else:
+            self.model = AutoModel.from_config(self.config)
+        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
+        self.fc = nn.Linear(self.config.hidden_size, 1)
+        self._init_weights(self.fc)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def feature(self, inputs):
+        outputs = self.model(**inputs)
+        last_hidden_states = outputs[0]
+        return last_hidden_states
+
+    def forward(self, inputs):
+        feature = self.feature(inputs)
+        output = self.fc(self.fc_dropout(feature))
+        return output
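
Note: the head is a per-token binary classifier, so `forward` returns one logit per input token rather than a pooled score. A minimal shape sketch, assuming the saved config at models_file/config.pth is available as in app.py (weights here are randomly initialized; only the shapes are meaningful):

import torch
from configuration import CFG
from model import CustomModel

model = CustomModel(CFG, config_path='models_file/config.pth', pretrained=False)
model.eval()

# Hypothetical dummy batch: batch_size=1, seq_len=16.
inputs = {
    'input_ids': torch.zeros(1, 16, dtype=torch.long),
    'attention_mask': torch.ones(1, 16, dtype=torch.long),
}
with torch.no_grad():
    logits = model(inputs)
print(logits.shape)  # torch.Size([1, 16, 1]) -- one logit per token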
models_file/config.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44242dd46e256e33385a5be4979c8df941af4ae4d8ad5f2feb5315d114da5f98
+size 2541
models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:994ef334eed041e7b0d62f2ad3f97444adcac4696a8027a5b14bf803bb27265f
+size 555618276
models_file/tokenizer/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
models_file/tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,66 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "tokenizer_class": "DebertaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "vocab_type": "gpt2"
+}
models_file/tokenizer/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+numpy
+scikit-learn
+gradio
utils.py
ADDED
@@ -0,0 +1,104 @@
+import itertools
+import torch
+import numpy as np
+from tqdm.auto import tqdm
+
+def get_char_probs(texts, predictions, tokenizer):
+    """
+    Maps predictions from the encoded offset mapping back onto the text.
+
+    Prediction = 466 sequence length * batch
+    text = 768 * batch
+    Using offset mapping [(0, 4), ] -- 466
+
+    Creates results the same size as the texts.
+
+    For each text result[i]:
+    result[0:4] = pred[0], and likewise for all offsets.
+
+    """
+    results = [np.zeros(len(t)) for t in texts]
+    for i, (text, prediction) in enumerate(zip(texts, predictions)):
+        encoded = tokenizer(text,
+                            add_special_tokens=True,
+                            return_offsets_mapping=True)
+        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
+            start = offset_mapping[0]
+            end = offset_mapping[1]
+            results[i][start:end] = pred
+    return results
+
+
+def get_results(char_probs, th=0.5):
+    """
+    Gets the list of probabilities, the same size as the text,
+    and then gets the indices of the characters that score above th.
+    example:
+    char_prob = [0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7] ## length == 766
+    indices where > 0.5 ## [2, 3, 4, 5, 9, 10, 11]
+
+    Group consecutive indices -- [[2, 3, 4, 5], [9, 10, 11]]
+    then take the min and max of each group as the output.
+
+    """
+    results = []
+    for char_prob in char_probs:
+        result = np.where(char_prob >= th)[0] + 1
+        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
+        result = [f"{min(r)} {max(r)}" for r in result]
+        result = ";".join(result)
+        results.append(result)
+    return results
+
+
+def get_predictions(results):
+    """
+    Returns the locations as strings, just like the location column in the df.
+    results = ['2 5', '9 11']
+
+    Loop through, split each into start and end, and store them in an array.
+    """
+    predictions = []
+    for result in results:
+        prediction = []
+        if result != "":
+            for loc in [s.split() for s in result.split(';')]:
+                start, end = int(loc[0]), int(loc[1])
+                prediction.append([start, end])
+        predictions.append(prediction)
+    return predictions
+
+def inference_fn(test_loader, model, device):
+    preds = []
+    model.eval()
+    model.to(device)
+    tk0 = tqdm(test_loader, total=len(test_loader))
+    for inputs in tk0:
+        for k, v in inputs.items():
+            inputs[k] = v.to(device)
+        with torch.no_grad():
+            y_preds = model(inputs)
+        preds.append(y_preds.sigmoid().numpy())
+    predictions = np.concatenate(preds)
+    return predictions
+
+def get_text(context, indexes):
+    if (indexes):
+        if ';' in indexes:
+            list_indexes = indexes.split(';')
+
+            answer = ''
+            for idx in list_indexes:
+                start_index = int(idx.split(' ')[0])
+                end_index = int(idx.split(' ')[1])
+                answer += ' '
+                answer += context[start_index:end_index]
+            return answer
+        else:
+            start_index = int(indexes.split(' ')[0])
+            end_index = int(indexes.split(' ')[1])
+
+            return context[start_index:end_index]
+    else:
+        return 'Not found in this Context'
+
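
Note: to make the span logic concrete, here is a worked example using the probabilities from the `get_results` docstring. The `+ 1` in `np.where(char_prob >= th)[0] + 1` shifts every index up by one, and `get_text` then uses the resulting "min max" pairs directly as slice bounds on the context:

import numpy as np
from utils import get_results, get_text

# Probabilities from the get_results docstring: two runs above the 0.5 threshold.
char_prob = np.array([0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7])

# 0-based indices >= 0.5 are [2, 3, 4, 5] and [9, 10, 11]; after the +1 shift,
# consecutive runs are grouped and rendered as "min max" span strings.
print(get_results([char_prob], th=0.5))  # ['3 6;10 12']

context = "abcdefghijkl"
print(get_text(context, '3 6;10 12'))    # ' def kl' -- context[3:6] + context[10:12]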