Ferdlance committed on
Commit f710534 · verified · 1 Parent(s): 093e312

Update app.py

Files changed (1): app.py (+247 -148)
app.py CHANGED
@@ -16,7 +16,7 @@ import pandas as pd
  import plotly.express as px
  import plotly.graph_objects as go
  from bs4 import BeautifulSoup
- import html2text  # FIX 1: missing import added
+ import html2text

  # Import the configuration module
  from config import app_config as config
@@ -32,7 +32,7 @@ st.set_page_config(
  config.init_session_state()

  # Initialize the HTML parser
- h = html2text.HTML2Text()  # FIX 1: parser initialization
+ h = html2text.HTML2Text()
  h.ignore_links = True

  # Logging configuration
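For reference, html2text renders HTML as plain text, and the ignore_links flag drops link targets while keeping the anchor text. A minimal, self-contained sketch of the parser configured above (the sample HTML string is illustrative):

import html2text

h = html2text.HTML2Text()
h.ignore_links = True    # drop <a href=...> targets, keep the anchor text
text = h.handle("<p>See the <a href='https://example.com'>docs</a>.</p>")
print(text.strip())      # -> "See the docs."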
@@ -209,7 +209,6 @@ class IAEnricher:

          if response_text:
              try:
-                 # FIX 3: replaced the fragile regex with a search for JSON delimiters
                  start = response_text.find('{')
                  end = response_text.rfind('}')
                  if start != -1 and end != -1:
@@ -238,7 +237,6 @@ class IAEnricher:

          if response_text:
              try:
-                 # FIX 3: replaced the fragile regex
                  start = response_text.find('{')
                  end = response_text.rfind('}')
                  if start != -1 and end != -1:
@@ -266,7 +264,6 @@ class IAEnricher:

          if response_text:
              try:
-                 # FIX 3: replaced the fragile regex
                  start = response_text.find('{')
                  end = response_text.rfind('}')
                  if start != -1 and end != -1:
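The three call sites above repeat the same delimiter scan: take the first '{' and the last '}' and parse the span between them, which tolerates conversational text around the model's JSON payload. A standalone sketch of the pattern (the helper name extract_json_block is hypothetical, not part of app.py):

import json

def extract_json_block(response_text):
    # Return the first '{'...'}' span parsed as JSON, or None on failure.
    start = response_text.find('{')
    end = response_text.rfind('}')
    if start == -1 or end == -1 or end <= start:
        return None
    try:
        return json.loads(response_text[start:end + 1])
    except json.JSONDecodeError:
        return None

print(extract_json_block('Sure! {"category": "security"} Hope this helps.'))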
@@ -289,14 +286,13 @@ def check_api_keys():

      valid_keys = {k: v for k, v in keys.items() if v and v != f'your_{k.lower()}_here'}

-     config.USE_API_KEYS = len(valid_keys) == len(keys)
+     config.USE_API_KEYS = len(valid_keys) > 0
      if not config.USE_API_KEYS:
-         missing = set(keys.keys()) - set(valid_keys.keys())
-         logger.warning(f"Missing or unconfigured API keys: {', '.join(missing)}")
-         logger.warning("The bot will run in degraded mode with longer pauses.")
+         logger.warning("No valid API key found. The bot will run in degraded mode with longer pauses.")
      else:
-         logger.info("All API keys are configured.")
-     return config.USE_API_KEYS
+         logger.info(f"Valid API keys found for: {', '.join(valid_keys.keys())}.")
+
+     return valid_keys

  def make_request(url, headers=None, params=None, is_api_call=True):
      config.REQUEST_COUNT += 1
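The revised check makes each key individually optional: USE_API_KEYS is now true when at least one key is valid, and the function returns the dict of valid keys so run_data_collection can gate each collector separately. A self-contained sketch of the same logic (the names list mirrors the keys used later in this diff; the exact keys dict in app.py is not shown in this hunk, so it is an assumption):

import logging
import os

logger = logging.getLogger(__name__)

def check_api_keys():
    names = ['GITHUB_API_TOKEN', 'HUGGINGFACE_API_TOKEN', 'NVD_API_KEY', 'STACK_EXCHANGE_API_KEY']
    keys = {k: os.getenv(k) for k in names}
    # A placeholder like "your_github_api_token_here" does not count as configured.
    valid_keys = {k: v for k, v in keys.items() if v and v != f'your_{k.lower()}_here'}
    if not valid_keys:
        logger.warning("No valid API key found; running in degraded mode.")
    else:
        logger.info("Valid API keys found for: %s", ", ".join(valid_keys))
    return valid_keys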
@@ -377,9 +373,14 @@ def save_qa_pair(question, answer, category, subcategory, source, attack_signatu
      except Exception as e:
          logger.error(f"Error while saving file {filename}: {str(e)}")

- def collect_kaggle_data(queries):
+ def collect_kaggle_data(queries, num_pages, results_per_page):
      logger.info("Starting Kaggle data collection...")

+     if not os.getenv('KAGGLE_USERNAME') or not os.getenv('KAGGLE_KEY'):
+         logger.warning("Kaggle keys not configured. Kaggle collection is skipped.")
+         st.session_state.logs.append("WARNING: Kaggle keys not configured. Collection skipped.")
+         return
+
      os.environ['KAGGLE_USERNAME'] = os.getenv('KAGGLE_USERNAME')
      os.environ['KAGGLE_KEY'] = os.getenv('KAGGLE_KEY')
      import kaggle
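The guard runs before `import kaggle` on purpose: the kaggle package authenticates as a side effect of being imported and raises if credentials are missing, so checking the environment first avoids an unhandled exception. A compact sketch of that ordering (the helper name is hypothetical):

import os

def kaggle_api_or_none():
    # Return an authenticated Kaggle API handle, or None if keys are missing.
    if not os.getenv('KAGGLE_USERNAME') or not os.getenv('KAGGLE_KEY'):
        return None
    import kaggle  # deferred: the package reads credentials at import time
    return kaggle.api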
@@ -394,7 +395,12 @@ def collect_kaggle_data(queries):
      for query in list(set(search_queries)):
          logger.info(f"Searching Kaggle datasets for: {query}")
          try:
-             datasets = kaggle.api.dataset_list(search=query, max_results=5)
+             # The Kaggle API does not support page-based pagination or "results_per_page"
+             datasets = kaggle.api.dataset_list(search=query, max_results=results_per_page)
+             if not datasets:
+                 logger.info(f"No dataset found for query '{query}'.")
+                 continue
+
              for dataset in datasets:
                  dataset_ref = dataset.ref
                  if ia_enricher.available and st.session_state.enable_enrichment:
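The comment above notes that the Kaggle client does not take a per-page size the way the REST sources do; if the installed client were to ignore or lack a max_results cap, slicing the returned list bounds the batch either way. A hedged two-line variant reusing the loop's query and results_per_page, not what the commit does:

# Hedged variant: cap client-side, independent of the client's own limits.
datasets = list(kaggle.api.dataset_list(search=query))[:results_per_page]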
@@ -405,6 +411,7 @@ def collect_kaggle_data(queries):

              logger.info(f"Processing dataset: {dataset_ref}")
              download_dir = Path("data") / "security" / "kaggle" / dataset_ref.replace('/', '_')
+             shutil.rmtree(download_dir, ignore_errors=True)  # make sure the directory is empty
              download_dir.mkdir(parents=True, exist_ok=True)
              kaggle.api.dataset_download_files(dataset_ref, path=download_dir, unzip=True)
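Note that the added rmtree call assumes shutil is imported elsewhere in app.py. In isolation, the reset-then-recreate idiom looks like this (the dataset ref is illustrative):

import shutil
from pathlib import Path

download_dir = Path("data") / "security" / "kaggle" / "owner_dataset"
shutil.rmtree(download_dir, ignore_errors=True)   # wipe stale files from earlier runs
download_dir.mkdir(parents=True, exist_ok=True)   # recreate a clean target directory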
@@ -427,142 +434,197 @@ def collect_kaggle_data(queries):
              logger.error(f"Error while collecting Kaggle data for {query}: {str(e)}")
      logger.info("Kaggle data collection finished.")

- def collect_github_data(queries):
+ def collect_github_data(queries, num_pages, results_per_page):
      logger.info("Starting GitHub data collection...")
      base_url = "https://api.github.com"
      headers = {"Accept": "application/vnd.github.v3+json"}
-     if config.USE_API_KEYS:
-         token = os.getenv('GITHUB_API_TOKEN')
-         headers["Authorization"] = f"token {token}"
+
+     github_token = os.getenv('GITHUB_API_TOKEN')
+     if github_token:
+         headers["Authorization"] = f"token {github_token}"
+     else:
+         logger.warning("GitHub key not configured. GitHub collection is skipped.")
+         st.session_state.logs.append("WARNING: GitHub key not configured. Collection skipped.")
+         return

      search_queries = queries.split('\n') if queries else ["topic:devsecops", "topic:security"]

      for query in search_queries:
-         logger.info(f"Searching repositories for: {query}")
-         search_url = f"{base_url}/search/repositories"
-         params = {"q": query, "sort": "stars", "per_page": 10}
-         response = make_request(search_url, headers=headers, params=params)
-         if not response:
-             continue
-
-         data = response.json()
-         for repo in data.get("items", []):
-             repo_name = repo["full_name"].replace("/", "_")
-             logger.info(f"Processing repository: {repo['full_name']}")
-
-             issues_url = f"{base_url}/repos/{repo['full_name']}/issues"
-             issues_params = {"state": "closed", "labels": "security,bug,vulnerability", "per_page": 10}
-             issues_response = make_request(issues_url, headers=headers, params=issues_params)
-
-             if issues_response:
-                 issues_data = issues_response.json()
-                 for issue in issues_data:
-                     if "pull_request" in issue: continue
-                     question = issue.get("title", "")
-                     body = clean_html(issue.get("body", ""))
-                     if not question or not body or len(body) < 50: continue
-
-                     comments_url = issue.get("comments_url")
-                     comments_response = make_request(comments_url, headers=headers)
-                     answer_parts = []
-                     if comments_response:
-                         comments_data = comments_response.json()
-                         for comment in comments_data:
-                             comment_body = clean_html(comment.get("body", ""))
-                             if comment_body: answer_parts.append(comment_body)
-
-                     if answer_parts:
-                         answer = "\n\n".join(answer_parts)
-                         save_qa_pair(
-                             question=f"{question}: {body}", answer=answer, category="devsecops",
-                             subcategory="github", source=f"github_{repo_name}"
-                         )
-                     time.sleep(random.uniform(1, 3))
+         logger.info(f"Searching repositories for: '{query}' over {num_pages} page(s)")
+
+         for page_number in range(1, num_pages + 1):
+             logger.info(f"Fetching page {page_number}...")
+             search_url = f"{base_url}/search/repositories"
+
+             params = {
+                 "q": query,
+                 "sort": "stars",
+                 "per_page": results_per_page,
+                 "page": page_number
+             }
+
+             response = make_request(search_url, headers=headers, params=params)
+             if not response:
+                 break
+
+             data = response.json()
+             items = data.get("items", [])
+
+             if not items:
+                 logger.info(f"End of results for this query (page {page_number}).")
+                 break
+
+             for repo in items:
+                 repo_name = repo["full_name"].replace("/", "_")
+                 logger.info(f"Processing repository: {repo['full_name']}")
+
+                 issues_url = f"{base_url}/repos/{repo['full_name']}/issues"
+                 issues_params = {"state": "closed", "labels": "security,bug,vulnerability", "per_page": 10}
+                 issues_response = make_request(issues_url, headers=headers, params=issues_params)
+
+                 if issues_response:
+                     issues_data = issues_response.json()
+                     for issue in issues_data:
+                         if "pull_request" in issue: continue
+                         question = issue.get("title", "")
+                         body = clean_html(issue.get("body", ""))
+                         if not question or not body or len(body) < 50: continue
+
+                         comments_url = issue.get("comments_url")
+                         comments_response = make_request(comments_url, headers=headers)
+                         answer_parts = []
+                         if comments_response:
+                             comments_data = comments_response.json()
+                             for comment in comments_data:
+                                 comment_body = clean_html(comment.get("body", ""))
+                                 if comment_body: answer_parts.append(comment_body)
+
+                         if answer_parts:
+                             answer = "\n\n".join(answer_parts)
+                             save_qa_pair(
+                                 question=f"{question}: {body}", answer=answer, category="devsecops",
+                                 subcategory="github", source=f"github_{repo_name}"
+                             )
+                         time.sleep(random.uniform(1, 3))
      logger.info("GitHub data collection finished.")

- def collect_huggingface_data(queries):
+ def collect_huggingface_data(queries, num_pages, results_per_page):
      logger.info("Starting Hugging Face data collection...")
      base_url = "https://huggingface.co/api"
      headers = {"Accept": "application/json"}
-     if config.USE_API_KEYS:
-         token = os.getenv('HUGGINGFACE_API_TOKEN')
-         headers["Authorization"] = f"Bearer {token}"
+
+     hf_token = os.getenv('HUGGINGFACE_API_TOKEN')
+     if hf_token:
+         headers["Authorization"] = f"Bearer {hf_token}"
+     else:
+         logger.warning("Hugging Face key not configured. Hugging Face collection is skipped.")
+         st.session_state.logs.append("WARNING: Hugging Face key not configured. Collection skipped.")
+         return

      search_queries = queries.split('\n') if queries else ["security", "devsecops"]
      for query in search_queries:
          logger.info(f"Searching datasets for: {query}")
-         search_url = f"{base_url}/datasets"
-         params = {"search": query, "limit": 10}
-         response = make_request(search_url, headers=headers, params=params)
-         if not response: continue
-
-         data = response.json()
-         for dataset in data:
-             dataset_id = dataset["id"].replace("/", "_")
-             logger.info(f"Processing dataset: {dataset['id']}")
-             dataset_url = f"{base_url}/datasets/{dataset['id']}"
-             dataset_response = make_request(dataset_url, headers=headers)
-
-             if dataset_response:
-                 dataset_data = dataset_response.json()
-                 description = clean_html(dataset_data.get("description", ""))
-                 if not description or len(description) < 100: continue
-                 tags = dataset_data.get("tags", [])
-                 tags_text = ", ".join(tags) if tags else "No tags"
-                 answer = f"Dataset: {dataset_data.get('id', '')}\nDownloads: {dataset_data.get('downloads', 0)}\nTags: {tags_text}\n\n{description}"
-
-                 save_qa_pair(
-                     question=f"What is the {dataset_data.get('id', '')} dataset about?", answer=answer,
-                     category="security", subcategory="dataset", source=f"huggingface_{dataset_id}", tags=tags
-                 )
-             time.sleep(random.uniform(1, 3))
+         # The Hugging Face API does not paginate by page number but by 'limit' and 'offset',
+         # so pagination is simulated by adjusting the offset.
+         for page_number in range(num_pages):
+             offset = page_number * results_per_page
+             search_url = f"{base_url}/datasets"
+             params = {"search": query, "limit": results_per_page, "offset": offset}
+
+             response = make_request(search_url, headers=headers, params=params)
+             if not response: continue
+
+             data = response.json()
+             if not data:
+                 logger.info(f"End of results for query '{query}'.")
+                 break
+
+             for dataset in data:
+                 dataset_id = dataset["id"].replace("/", "_")
+                 logger.info(f"Processing dataset: {dataset['id']}")
+                 dataset_url = f"{base_url}/datasets/{dataset['id']}"
+                 dataset_response = make_request(dataset_url, headers=headers)
+
+                 if dataset_response:
+                     dataset_data = dataset_response.json()
+                     description = clean_html(dataset_data.get("description", ""))
+                     if not description or len(description) < 100: continue
+                     tags = dataset_data.get("tags", [])
+                     tags_text = ", ".join(tags) if tags else "No tags"
+                     answer = f"Dataset: {dataset_data.get('id', '')}\nDownloads: {dataset_data.get('downloads', 0)}\nTags: {tags_text}\n\n{description}"
+
+                     save_qa_pair(
+                         question=f"What is the {dataset_data.get('id', '')} dataset about?", answer=answer,
+                         category="security", subcategory="dataset", source=f"huggingface_{dataset_id}", tags=tags
+                     )
+                 time.sleep(random.uniform(1, 3))
      logger.info("Hugging Face data collection finished.")

- def collect_nvd_data():
+ def collect_nvd_data(queries, num_pages, results_per_page):
      logger.info("Starting NVD data collection...")
      base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0"
      headers = {"Accept": "application/json"}
-     if config.USE_API_KEYS:
-         key = os.getenv('NVD_API_KEY')
-         headers["apiKey"] = key
-
-     params = {"resultsPerPage": 50}
-     response = make_request(base_url, headers=headers, params=params)
-     if not response:
-         logger.warning("Unable to retrieve NVD data.")
-         return
-
-     data = response.json()
-     vulnerabilities = data.get("vulnerabilities", [])
-     logger.info(f"Processing {len(vulnerabilities)} vulnerabilities...")
-
-     for vuln in vulnerabilities:
-         cve_data = vuln.get("cve", {})
-         cve_id = cve_data.get("id", "")
-         descriptions = cve_data.get("descriptions", [])
-         description = next((desc.get("value", "") for desc in descriptions if desc.get("lang") == "en"), "")
-         if not description or len(description) < 50: continue
-
-         cvss_v3 = cve_data.get("metrics", {}).get("cvssMetricV31", [{}])[0].get("cvssData", {})
-         severity = cvss_v3.get("baseSeverity", "UNKNOWN")
-         score = cvss_v3.get("baseScore", 0)
-         references = [ref.get("url", "") for ref in cve_data.get("references", [])]
-
-         answer = f"CVE ID: {cve_id}\nSeverity: {severity}\nCVSS Score: {score}\nReferences: {', '.join(references[:5])}\n\nDescription: {description}"
-
-         save_qa_pair(
-             question=f"What is the vulnerability {cve_id}?", answer=answer,
-             category="security", subcategory="vulnerability", source=f"nvd_{cve_id}"
-         )
+
+     nvd_key = os.getenv('NVD_API_KEY')
+     if nvd_key:
+         headers["apiKey"] = nvd_key
+     else:
+         logger.warning("NVD key not configured. NVD collection is skipped.")
+         st.session_state.logs.append("WARNING: NVD key not configured. Collection skipped.")
+         return
+
+     for page in range(num_pages):
+         start_index = page * results_per_page
+         logger.info(f"Fetching NVD page, start index: {start_index}")
+         params = {"resultsPerPage": results_per_page, "startIndex": start_index}
+         response = make_request(base_url, headers=headers, params=params)
+
+         if not response:
+             logger.warning("Unable to retrieve NVD data. Stopping NVD collection.")
+             break
+
+         data = response.json()
+         vulnerabilities = data.get("vulnerabilities", [])
+         if not vulnerabilities:
+             logger.info("End of results for the NVD collection.")
+             break
+
+         logger.info(f"Processing {len(vulnerabilities)} vulnerabilities...")
+
+         for vuln in vulnerabilities:
+             cve_data = vuln.get("cve", {})
+             cve_id = cve_data.get("id", "")
+             descriptions = cve_data.get("descriptions", [])
+             description = next((desc.get("value", "") for desc in descriptions if desc.get("lang") == "en"), "")
+             if not description or len(description) < 50: continue
+
+             cvss_v3 = cve_data.get("metrics", {}).get("cvssMetricV31", [{}])[0].get("cvssData", {})
+             severity = cvss_v3.get("baseSeverity", "UNKNOWN")
+             score = cvss_v3.get("baseScore", 0)
+             references = [ref.get("url", "") for ref in cve_data.get("references", [])]
+
+             answer = f"CVE ID: {cve_id}\nSeverity: {severity}\nCVSS Score: {score}\nReferences: {', '.join(references[:5])}\n\nDescription: {description}"
+
+             save_qa_pair(
+                 question=f"What is the vulnerability {cve_id}?", answer=answer,
+                 category="security", subcategory="vulnerability", source=f"nvd_{cve_id}"
+             )
+         time.sleep(random.uniform(1, 3))
      logger.info("NVD data collection finished.")

- def collect_stack_exchange_data(queries):
+ def collect_stack_exchange_data(queries, num_pages, results_per_page):
      logger.info("Starting Stack Exchange data collection...")
      base_url = "https://api.stackexchange.com/2.3"
-     params_base = {"pagesize": 10, "order": "desc", "sort": "votes", "filter": "withbody"}
-     if config.USE_API_KEYS:
-         key = os.getenv('STACK_EXCHANGE_API_KEY')
-         params_base["key"] = key
+     params_base = {"pagesize": results_per_page, "order": "desc", "sort": "votes", "filter": "withbody"}
+
+     se_key = os.getenv('STACK_EXCHANGE_API_KEY')
+     if se_key:
+         params_base["key"] = se_key
+     else:
+         logger.warning("Stack Exchange key not configured. Collection is skipped.")
+         st.session_state.logs.append("WARNING: Stack Exchange key not configured. Collection skipped.")
+         return

      sites = [
          {"site": "security", "category": "security", "subcategory": "security"},
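The rewritten collectors mix two pagination styles: GitHub takes a 1-based page parameter, while the Hugging Face hub endpoint is driven by limit and offset, and NVD similarly combines resultsPerPage with a startIndex. A small sketch of the offset arithmetic shared by the latter two (the function name is illustrative):

def offset_pages(num_pages, results_per_page):
    # Yield (page_number, offset) pairs for limit/offset style APIs.
    for page_number in range(num_pages):
        yield page_number + 1, page_number * results_per_page

for page, offset in offset_pages(num_pages=3, results_per_page=20):
    print(f"page {page}: limit=20 offset={offset}")  # offsets 0, 20, 40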
@@ -582,40 +644,48 @@ def collect_stack_exchange_data(queries):
          tags = tags_by_site.get(site, []) + (queries.split('\n') if queries else [])

          for tag in list(set(tags)):
-             logger.info(f"Searching questions with tag: {tag}")
+             logger.info(f"Searching questions with tag: '{tag}'")
              questions_url = f"{base_url}/questions"
-             params = {**params_base, "site": site, "tagged": tag}
-
-             response = make_request(questions_url, params=params)
-             if not response: continue
-
-             questions_data = response.json()
-             for question in questions_data.get("items", []):
-                 question_id = question.get("question_id")
-                 title = question.get("title", "")
-                 body = clean_html(question.get("body", ""))
-                 if not body or len(body) < 50: continue
-
-                 answers_url = f"{base_url}/questions/{question_id}/answers"
-                 answers_params = {**params_base, "site": site}
-                 answers_response = make_request(answers_url, params=answers_params)
-                 answer_body = ""
-                 if answers_response and answers_response.json().get("items"):
-                     answer_body = clean_html(answers_response.json()["items"][0].get("body", ""))
-
-                 if answer_body:
-                     save_qa_pair(
-                         question=title, answer=answer_body, category=category,
-                         subcategory=subcategory, source=f"{site}_{question_id}", tags=question.get("tags", [])
-                     )
-                 time.sleep(random.uniform(1, 3))
+
+             for page_number in range(1, num_pages + 1):
+                 params = {**params_base, "site": site, "tagged": tag, "page": page_number}
+
+                 response = make_request(questions_url, params=params)
+                 if not response: continue
+
+                 questions_data = response.json()
+                 items = questions_data.get("items", [])
+
+                 if not items:
+                     logger.info(f"End of results for tag '{tag}' at page {page_number}.")
+                     break
+
+                 for question in items:
+                     question_id = question.get("question_id")
+                     title = question.get("title", "")
+                     body = clean_html(question.get("body", ""))
+                     if not body or len(body) < 50: continue
+
+                     answers_url = f"{base_url}/questions/{question_id}/answers"
+                     answers_params = {**params_base, "site": site}
+                     answers_response = make_request(answers_url, params=answers_params)
+                     answer_body = ""
+                     if answers_response and answers_response.json().get("items"):
+                         answer_body = clean_html(answers_response.json()["items"][0].get("body", ""))
+
+                     if answer_body:
+                         save_qa_pair(
+                             question=title, answer=answer_body, category=category,
+                             subcategory=subcategory, source=f"{site}_{question_id}", tags=question.get("tags", [])
+                         )
+                     time.sleep(random.uniform(1, 3))
      logger.info("Stack Exchange data collection finished.")

- def run_data_collection(sources, queries):
+ def run_data_collection(sources, queries, num_pages, results_per_page):
      st.session_state.bot_status = "Running"
      st.session_state.logs = []

-     check_api_keys()
+     valid_keys = check_api_keys()

      progress_bar = st.progress(0)
      status_text = st.empty()
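The paging loop above stops when a page comes back empty. The Stack Exchange API also returns a has_more flag in its response wrapper, which would end the loop one request earlier; a hedged variant of the same loop, with its dependencies passed in for self-containment (not what the commit does):

def fetch_tagged_questions(make_request, base_url, params_base, site, tag, num_pages):
    # Collect question items, stopping early when the API reports has_more=False.
    items = []
    for page_number in range(1, num_pages + 1):
        params = {**params_base, "site": site, "tagged": tag, "page": page_number}
        response = make_request(f"{base_url}/questions", params=params)
        if not response:
            continue
        payload = response.json()
        items.extend(payload.get("items", []))
        if not payload.get("has_more", False):
            break  # no further pages for this tag
    return items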
@@ -625,18 +695,34 @@ def run_data_collection(sources, queries):

      completed_sources = 0

      for source_name in enabled_sources:
+         if source_name == "Kaggle" and 'KAGGLE_USERNAME' not in os.environ:
+             logger.warning("Kaggle keys not set in the environment variables. Skipping Kaggle collection.")
+             continue
+         if source_name == "GitHub" and not valid_keys.get('GITHUB_API_TOKEN'):
+             logger.warning("GitHub key not set. Skipping GitHub collection.")
+             continue
+         if source_name == "Hugging Face" and not valid_keys.get('HUGGINGFACE_API_TOKEN'):
+             logger.warning("Hugging Face key not set. Skipping Hugging Face collection.")
+             continue
+         if source_name == "NVD" and not valid_keys.get('NVD_API_KEY'):
+             logger.warning("NVD key not set. Skipping NVD collection.")
+             continue
+         if source_name == "Stack Exchange" and not valid_keys.get('STACK_EXCHANGE_API_KEY'):
+             logger.warning("Stack Exchange key not set. Skipping Stack Exchange collection.")
+             continue
+
          status_text.text(f"Collecting data from {source_name}...")
          try:
              if source_name == "Kaggle":
-                 collect_kaggle_data(queries.get("Kaggle", ""))
+                 collect_kaggle_data(queries.get("Kaggle", ""), num_pages, results_per_page)
              elif source_name == "GitHub":
-                 collect_github_data(queries.get("GitHub", ""))
+                 collect_github_data(queries.get("GitHub", ""), num_pages, results_per_page)
              elif source_name == "Hugging Face":
-                 collect_huggingface_data(queries.get("Hugging Face", ""))
+                 collect_huggingface_data(queries.get("Hugging Face", ""), num_pages, results_per_page)
              elif source_name == "NVD":
-                 collect_nvd_data()
+                 collect_nvd_data(queries.get("NVD", ""), num_pages, results_per_page)
              elif source_name == "Stack Exchange":
-                 collect_stack_exchange_data(queries.get("Stack Exchange", ""))
+                 collect_stack_exchange_data(queries.get("Stack Exchange", ""), num_pages, results_per_page)
          except Exception as e:
              logger.error(f"Fatal error while collecting {source_name}: {str(e)}")
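The per-source gating repeats one if block per source; the same checks can be table-driven, shown here as a possible refactor rather than what the commit does:

import os

REQUIRED_KEY = {
    "GitHub": "GITHUB_API_TOKEN",
    "Hugging Face": "HUGGINGFACE_API_TOKEN",
    "NVD": "NVD_API_KEY",
    "Stack Exchange": "STACK_EXCHANGE_API_KEY",
}

def is_source_enabled(source_name, valid_keys):
    # Kaggle is gated on environment variables; the rest on valid_keys.
    if source_name == "Kaggle":
        return 'KAGGLE_USERNAME' in os.environ
    required = REQUIRED_KEY.get(source_name)
    return required is None or bool(valid_keys.get(required))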
@@ -648,7 +734,6 @@ def run_data_collection(sources, queries):
      progress_bar.empty()
      status_text.empty()

-     # FIX 2: force a UI refresh to display the results
      st.rerun()

  def main():
@@ -672,6 +757,19 @@ def main():

      st.markdown("---")

+     st.header("Collection parameters")
+     col1, col2 = st.columns(2)
+     num_pages = col1.slider(
+         "Number of pages to fetch per source",
+         min_value=1, max_value=20, value=5,
+         help="The bot will fetch up to X pages of results for each source."
+     )
+     results_per_page = col2.slider(
+         "Number of results per page",
+         min_value=10, max_value=100, value=20,
+         help="The bot will request up to Y results for each page fetched."
+     )
+
      st.header("Start the collection")

      st.subheader("Data sources")
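The two sliders bound the request volume multiplicatively: each source issues up to num_pages search requests per query, before any per-item follow-up calls. A quick back-of-the-envelope helper (the numbers below use the slider defaults and the three default queries per source):

def worst_case_search_requests(num_sources, queries_per_source, num_pages):
    # Upper bound on search-page requests alone; item-detail calls come on top.
    return num_sources * queries_per_source * num_pages

print(worst_case_search_requests(num_sources=5, queries_per_source=3, num_pages=5))  # 75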
@@ -689,6 +787,7 @@ def main():
      queries["GitHub"] = st.text_area("GitHub queries (one per line)", "topic:devsecops\ntopic:security\nvulnerability")
      queries["Kaggle"] = st.text_area("Kaggle queries (one per line)", "cybersecurity\nvulnerability dataset\npenetration testing")
      queries["Hugging Face"] = st.text_area("Hugging Face queries (one per line)", "security dataset\nvulnerability\nlanguage model security")
+     queries["NVD"] = ""
      queries["Stack Exchange"] = st.text_area("Stack Exchange tags (one per line)", "devsecops\nsecurity\nvulnerability")

      st.markdown("---")
@@ -698,7 +797,7 @@ def main():
              st.session_state.logs = []
              st.session_state.qa_data = []
              st.session_state.total_qa_pairs = 0
-             run_data_collection(sources, queries)
+             run_data_collection(sources, queries, num_pages, results_per_page)
          else:
              st.warning("Collection is in progress. Please wait for it to finish.")
              if st.button("Force stop", use_container_width=True, type="secondary"):
@@ -779,4 +878,4 @@ def main():
      st.session_state.n_predict = st.slider("Number of tokens", 128, 1024, st.session_state.n_predict, help="Maximum number of tokens for the AI to generate.")

  if __name__ == "__main__":
-     main()
+     main()
 