Update streamlit_langchain_chat/dataset.py
streamlit_langchain_chat/dataset.py
CHANGED
@@ -98,8 +98,6 @@ def parse_docx(path, citation, key, chunk_chars=2000, overlap=50):
     return [], []


-# TODO: if you add a connector with the format loader = ... ; data = loader.load();
-# you will be able to plug in all of the langchain connectors
 # https://langchain.readthedocs.io/en/stable/modules/document_loaders/examples/pdf.html
 def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     pdfFileObj = open(path, "rb")
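
The removed TODO pointed at the generic langchain loader pattern (loader = ... ; data = loader.load()). As a rough illustration only, not code from this repository, such a connector could look like the sketch below, assuming langchain's PyPDFLoader from langchain.document_loaders:

# Illustrative sketch of the langchain connector pattern named in the removed TODO.
# PyPDFLoader is an assumption here; dataset.py does not use it.
from langchain.document_loaders import PyPDFLoader

def load_with_langchain(path: str):
    loader = PyPDFLoader(path)
    data = loader.load()  # list of Document objects, typically one per page
    return [d.page_content for d in data], [d.metadata for d in data]
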
@@ -111,9 +109,6 @@ def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
     for i, page in enumerate(pdfReader.pages):
         split += page.extract_text()
         pages.append(str(i + 1))
-        # split could be so long it needs to be split
-        # into multiple chunks. Or it could be so short
-        # that it needs to be combined with the next chunk.
         while len(split) > chunk_chars:
             splits.append(split[:chunk_chars])
             # pretty formatting of pages (e.g. 1-3, 4, 5-7)
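
The loop above does fixed-size chunking of the accumulated page text. A standalone sketch of the same idea (a hypothetical helper, not part of dataset.py), with the overlap parameter applied between consecutive chunks:

# Hypothetical helper illustrating fixed-size chunking with overlap,
# mirroring the inline loop in parse_pdf; not code from dataset.py.
def chunk_text(text: str, chunk_chars: int = 2000, overlap: int = 50) -> list[str]:
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_chars])
        # step forward so consecutive chunks share `overlap` characters
        # (requires chunk_chars > overlap to terminate)
        start += chunk_chars - overlap
    return chunks
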
@@ -184,7 +179,6 @@ def parse_txt(path, citation, key, chunk_chars=2000, overlap=50, html=False):
         doc = f.read()
     if html:
         doc = html2text(doc)
-    # yo, no idea why but the texts are not split correctly
     text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
     texts = text_splitter.split_text(doc)
     return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
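
TextSplitter here is whatever name the module binds; langchain's own TextSplitter is an abstract base class. A minimal standalone equivalent, assuming langchain's CharacterTextSplitter (an assumption, the repository may bind a different splitter), would be:

# Sketch assuming langchain's CharacterTextSplitter; dataset.py may use a different class under the name TextSplitter.
from langchain.text_splitter import CharacterTextSplitter

def split_doc(doc: str, citation: str, key: str, chunk_chars: int = 2000, overlap: int = 50):
    splitter = CharacterTextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
    texts = splitter.split_text(doc)
    # one metadata dict per chunk, mirroring parse_txt's return value
    return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
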
@@ -289,8 +283,8 @@ def read_source(path: str = None,
         return parse_pptx(path, citation, key, chunk_chars, overlap)
     elif path.startswith("http://") or path.startswith("https://"):
         return parse_url(path, citation, key, chunk_chars, overlap)
-    #
-    #
+    # WIP
+    #else:
     #     return parse_code_txt(path, citation, key, chunk_chars, overlap)
     else:
         raise "unknown extension"
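
Note that the unchanged context line raise "unknown extension" would fail at runtime, since Python can only raise exception instances, not strings. A hypothetical sketch of the same dispatch pattern with a proper exception (parse_pptx and parse_url are the module's own parsers; the real read_source handles more formats):

# Hypothetical sketch of read_source's dispatch, not the full function.
def read_source_sketch(path, citation, key, chunk_chars=2000, overlap=50):
    if path.startswith("http://") or path.startswith("https://"):
        return parse_url(path, citation, key, chunk_chars, overlap)
    elif path.endswith(".pptx"):
        return parse_pptx(path, citation, key, chunk_chars, overlap)
    # raise an exception instance instead of a bare string
    raise ValueError(f"unknown extension for {path}")
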
@@ -510,20 +504,6 @@ class Dataset:
             lambda x, y: x + y, [doc["metadata"] for doc in self.docs.values()], []
         )

-        # TODO: when the index already exists, do not delete it; update it instead
-        # index_name = "langchain-demo1"
-        # if index_name in pinecone.list_indexes():
-        #     self.index_docstore = pinecone.Index(index_name)
-        #     vectors = []
-        #     for text, metadata in zip(texts, metadatas):
-        #         # embed = <we would need to know which embedding the existing index was built with>
-        #     self.index_docstore.upsert(vectors=vectors)
-        # else:
-        #     if openai.api_type == 'azure':
-        #         self.index_docstore = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-        #     else:
-        #         self.index_docstore = OriginalPinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
-
         index_name = "langchain-demo1"

         # if the index exists, delete it
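
The removed TODO sketched reusing an existing Pinecone index instead of recreating it. A hedged reconstruction of that idea, using the pinecone-client and langchain calls that appear in the removed lines; the vector tuple format and embed_query call are my assumptions, and the open question from the original comment (which embedding model built the existing index) still applies:

# Sketch reconstructing the removed TODO: reuse an existing Pinecone index instead of recreating it.
# Assumes the pinecone-client v2 API and langchain's Pinecone wrapper, as in the removed lines.
import pinecone
from langchain.vectorstores import Pinecone

def get_or_build_index(texts, metadatas, embedding, index_name="langchain-demo1"):
    if index_name in pinecone.list_indexes():
        index = pinecone.Index(index_name)
        # must embed with the SAME model the existing index was built with,
        # which is exactly the unknown the removed comment pointed out
        vectors = [
            (str(i), embedding.embed_query(text), metadata)
            for i, (text, metadata) in enumerate(zip(texts, metadatas))
        ]
        index.upsert(vectors=vectors)
        return index
    # otherwise build a fresh index from the texts
    return Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
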
@@ -593,7 +573,6 @@ class Dataset:
                     break
         if OPERATING_MODE == "debug":
             print(f"time to make all relevant summaries: {time.time() - init_summary_time:.2f} [s]")
-            # the last character is not printed because it is a \n
             print(partial_summary_time[:-1])
         context_str = "\n\n".join(
             [f"{citation}: {summary_of_chunked_text}"
@@ -693,7 +672,7 @@ class Dataset:
                 'total_tokens': cb.total_tokens
             })

-        # it still happens
+        # it still happens ulol
         if "(Foo2012)" in answer_text:
             answer_text = answer_text.replace("(Foo2012)", "")
         for key, citation, summary, text in answer.packages: