Update streamlit_langchain_chat/dataset.py
streamlit_langchain_chat/dataset.py
CHANGED
@@ -98,8 +98,6 @@ def parse_docx(path, citation, key, chunk_chars=2000, overlap=50):
     return [], []


-# TODO: if you add a connector with the format loader = ... ; data = loader.load();
-# you will be able to plug in all of the langchain connectors
 # https://langchain.readthedocs.io/en/stable/modules/document_loaders/examples/pdf.html
 def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     pdfFileObj = open(path, "rb")
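
The removed TODO pointed at the generic langchain loader pattern (loader = ... ; data = loader.load()). As a rough illustration only, not code from this repository, such a connector could look like the sketch below, assuming langchain's PyPDFLoader from langchain.document_loaders:

# Illustrative sketch of the langchain connector pattern named in the removed TODO.
# PyPDFLoader is an assumption here; dataset.py does not use it.
from langchain.document_loaders import PyPDFLoader

def load_with_langchain(path: str):
    loader = PyPDFLoader(path)
    data = loader.load()  # list of Document objects, typically one per page
    return [d.page_content for d in data], [d.metadata for d in data]
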
@@ -111,9 +109,6 @@ def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     for i, page in enumerate(pdfReader.pages):
         split += page.extract_text()
         pages.append(str(i + 1))
-        # split could be so long it needs to be split
-        # into multiple chunks. Or it could be so short
-        # that it needs to be combined with the next chunk.
         while len(split) > chunk_chars:
             splits.append(split[:chunk_chars])
             # pretty formatting of pages (e.g. 1-3, 4, 5-7)
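
The loop above does fixed-size chunking of the accumulated page text. A standalone sketch of the same idea (a hypothetical helper, not part of dataset.py), with the overlap parameter applied between consecutive chunks:

# Hypothetical helper illustrating fixed-size chunking with overlap,
# mirroring the inline loop in parse_pdf; not code from dataset.py.
def chunk_text(text: str, chunk_chars: int = 2000, overlap: int = 50) -> list[str]:
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_chars])
        # step forward so consecutive chunks share `overlap` characters
        # (requires chunk_chars > overlap to terminate)
        start += chunk_chars - overlap
    return chunks
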
@@ -184,7 +179,6 @@ def parse_txt(path, citation, key, chunk_chars=2000, overlap=50, html=False):
         doc = f.read()
     if html:
         doc = html2text(doc)
-    # yo, no idea why but the texts are not split correctly
     text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
     texts = text_splitter.split_text(doc)
     return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
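
TextSplitter here is whatever name the module binds; langchain's own TextSplitter is an abstract base class. A minimal standalone equivalent, assuming langchain's CharacterTextSplitter (an assumption, the repository may bind a different splitter), would be:

# Sketch assuming langchain's CharacterTextSplitter; dataset.py may use a different class under the name TextSplitter.
from langchain.text_splitter import CharacterTextSplitter

def split_doc(doc: str, citation: str, key: str, chunk_chars: int = 2000, overlap: int = 50):
    splitter = CharacterTextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
    texts = splitter.split_text(doc)
    # one metadata dict per chunk, mirroring parse_txt's return value
    return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
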
@@ -289,8 +283,8 @@ def read_source(path: str = None,
         return parse_pptx(path, citation, key, chunk_chars, overlap)
     elif path.startswith("http://") or path.startswith("https://"):
         return parse_url(path, citation, key, chunk_chars, overlap)
-    #
-    #
+    # WIP
+    #else:
     #     return parse_code_txt(path, citation, key, chunk_chars, overlap)
     else:
         raise "unknown extension"
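
Note that the unchanged context line raise "unknown extension" would fail at runtime, since Python can only raise exception instances, not strings. A hypothetical sketch of the same dispatch pattern with a proper exception (parse_pptx and parse_url are the module's own parsers; the real read_source handles more formats):

# Hypothetical sketch of read_source's dispatch, not the full function.
def read_source_sketch(path, citation, key, chunk_chars=2000, overlap=50):
    if path.startswith("http://") or path.startswith("https://"):
        return parse_url(path, citation, key, chunk_chars, overlap)
    elif path.endswith(".pptx"):
        return parse_pptx(path, citation, key, chunk_chars, overlap)
    # raise an exception instance instead of a bare string
    raise ValueError(f"unknown extension for {path}")
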
@@ -510,20 +504,6 @@ class Dataset:
             lambda x, y: x + y, [doc["metadata"] for doc in self.docs.values()], []
         )

-        # TODO: when the index already exists, do not delete it; update it instead
-        # index_name = "langchain-demo1"
-        # if index_name in pinecone.list_indexes():
-        #     self.index_docstore = pinecone.Index(index_name)
-        #     vectors = []
-        #     for text, metadata in zip(texts, metadatas):
-        #         # embed = <we would need to know which embedding the existing index was built with>
-        #     self.index_docstore.upsert(vectors=vectors)
-        # else:
-        #     if openai.api_type == 'azure':
-        #         self.index_docstore = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-        #     else:
-        #         self.index_docstore = OriginalPinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-
         index_name = "langchain-demo1"

         # if the index exists, delete it
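
The removed TODO sketched reusing an existing Pinecone index instead of recreating it. A hedged reconstruction of that idea, using the pinecone-client and langchain calls that appear in the removed lines; the vector tuple format and embed_query call are my assumptions, and the open question from the original comment (which embedding model built the existing index) still applies:

# Sketch reconstructing the removed TODO: reuse an existing Pinecone index instead of recreating it.
# Assumes the pinecone-client v2 API and langchain's Pinecone wrapper, as in the removed lines.
import pinecone
from langchain.vectorstores import Pinecone

def get_or_build_index(texts, metadatas, embedding, index_name="langchain-demo1"):
    if index_name in pinecone.list_indexes():
        index = pinecone.Index(index_name)
        # must embed with the SAME model the existing index was built with,
        # which is exactly the unknown the removed comment pointed out
        vectors = [
            (str(i), embedding.embed_query(text), metadata)
            for i, (text, metadata) in enumerate(zip(texts, metadatas))
        ]
        index.upsert(vectors=vectors)
        return index
    # otherwise build a fresh index from the texts
    return Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
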
@@ -593,7 +573,6 @@ class Dataset:
                     break
         if OPERATING_MODE == "debug":
             print(f"time to make all relevant summaries: {time.time() - init_summary_time:.2f} [s]")
-            # the last character is not printed because it is a \n
             print(partial_summary_time[:-1])
         context_str = "\n\n".join(
             [f"{citation}: {summary_of_chunked_text}"
@@ -693,7 +672,7 @@ class Dataset:
                 'total_tokens': cb.total_tokens
             })

-        # it still happens
+        # it still happens ulol
         if "(Foo2012)" in answer_text:
             answer_text = answer_text.replace("(Foo2012)", "")
         for key, citation, summary, text in answer.packages: