fix

Adds error handling, logging, headless Chrome options, and a ChromeDriver path fallback to the Trendyol review scraper; the "Kullanıcı_id" column is now filled with the review index, and the temporary CSV is removed once the driver is closed.

scrape/trendyol_scraper.py (+75 -45, CHANGED)

@@ -3,108 +3,138 @@ from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
 import time
 import pandas as pd
 import os
+import logging
+
+logger = logging.getLogger(__name__)

 def comprehensive_scroll(driver):
-
-        last_height = new_height
+    """Scroll until no more new content is loaded"""
+    try:
+        last_height = driver.execute_script("return document.body.scrollHeight")
+        while True:
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(3)
+
+            new_height = driver.execute_script("return document.body.scrollHeight")
+            if new_height == last_height:
+                break
+            last_height = new_height
+    except Exception as e:
+        logger.error(f"Scroll sırasında hata: {str(e)}")

 def scrape_reviews(url):
     """Fetch the reviews from the given URL."""
-
+    driver = None
+    try:
+        # Create the data directory
+        data_directory = "data"
+        if not os.path.exists(data_directory):
+            os.makedirs(data_directory)

-
+        # Chrome options
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument('--headless')
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument("--window-size=1920,1080")

-    try:
         # ChromeDriver setup for Linux
-
+        try:
+            # Try /usr/local/bin/chromedriver first
+            service = Service('/usr/local/bin/chromedriver')
+            driver = webdriver.Chrome(service=service, options=chrome_options)
+        except:
+            try:
+                # If that fails, try /usr/bin/chromedriver
+                service = Service('/usr/bin/chromedriver')
+                driver = webdriver.Chrome(service=service, options=chrome_options)
+            except:
+                # Finally, fall back to chromedriver on PATH
+                service = Service('chromedriver')
+                driver = webdriver.Chrome(service=service, options=chrome_options)

+        logger.info(f"URL'ye erişiliyor: {url}")
         driver.get(url)

         # Accept the cookie popup
-
+        try:
+            WebDriverWait(driver, 10).until(
+                EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
+            ).click()
+            logger.info("Çerez popup'ı kabul edildi")
+        except TimeoutException:
+            logger.warning("Çerez popup'ı bulunamadı veya tıklanamadı")

+        logger.info("Sayfa kaydırılıyor...")
         comprehensive_scroll(driver)

+        logger.info("Yorumlar toplanıyor...")
         comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
         total_comments = len(comment_elements)
+        logger.info(f"Toplam {total_comments} yorum bulundu")

         data = []
         for i in range(1, total_comments + 1):
-            kullanıcı_id = i
             try:
                 username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
                 username = driver.find_element(By.XPATH, username_xpath).text
-            except:
+            except NoSuchElementException:
                 username = "N/A"

             try:
                 comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
                 comment = driver.find_element(By.XPATH, comment_xpath).text
-            except:
+            except NoSuchElementException:
                 comment = "N/A"

             try:
                 date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
                 date = driver.find_element(By.XPATH, date_xpath).text
-            except:
+            except NoSuchElementException:
                 date = "N/A"

-            star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
             try:
+                star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
                 full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
                 star_count = len(full_stars)
-            except:
+            except NoSuchElementException:
                 star_count = 0

             data.append({
-                "Kullanıcı_id":
+                "Kullanıcı_id": i,
                 "Kullanıcı Adı": username,
                 "Yorum": comment,
                 "Tarih": date,
                 "Yıldız Sayısı": star_count
             })

+            if i % 10 == 0:
+                logger.info(f"{i}/{total_comments} yorum toplandı")
+
+        df = pd.DataFrame(data)
+
         # Save to a temporary file
         temp_file = os.path.join(data_directory, 'temp_comments.csv')
-        df = pd.DataFrame(data)
         df.to_csv(temp_file, index=False, encoding='utf-8-sig')
+        logger.info(f"Veriler {temp_file} dosyasına kaydedildi")

         return df

     except Exception as e:
-
-        return pd.DataFrame()
+        logger.error(f"Veri çekme sırasında hata: {str(e)}")
+        return pd.DataFrame()

     finally:
-        driver
+        if driver:
+            driver.quit()
+            logger.info("Chrome driver kapatıldı")
         # Delete the temporary file
-
+        temp_file = os.path.join("data", 'temp_comments.csv')
+        if os.path.exists(temp_file):
+            os.remove(temp_file)
+            logger.info("Geçici dosya silindi")
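For quick reference, here is a minimal sketch of how the updated scrape_reviews function might be driven after this change. It assumes scrape/trendyol_scraper.py is importable as scrape.trendyol_scraper and that the caller supplies a Trendyol review-page URL; the logging setup, the placeholder URL, and the output filename are illustrative and not part of the commit.

# Hypothetical driver script, not part of this commit.
import logging

from scrape.trendyol_scraper import scrape_reviews  # assumes 'scrape' is importable as a package

# The module only calls logging.getLogger(__name__); the caller configures handlers.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")

url = "https://www.trendyol.com/..."  # placeholder: a product's review-page URL
df = scrape_reviews(url)              # returns an empty DataFrame on failure

if df.empty:
    print("No reviews were scraped; check the log output above.")
else:
    # Persist the result; the function's own temp_comments.csv is deleted in its finally block.
    df.to_csv("trendyol_reviews.csv", index=False, encoding="utf-8-sig")
    print(f"Saved {len(df)} reviews to trendyol_reviews.csv")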