-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript7.py
More file actions
185 lines (162 loc) · 7.61 KB
/
Copy pathscript7.py
File metadata and controls
185 lines (162 loc) · 7.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from bs4.element import Tag
from random import randint
import mysql.connector
import csv
import time
_HOME_URL = "https://www.carousell.sg/"
_BASE_URL = "https://www.carousell.sg"
# MySQL limits table names to 64 characters.
_MAX_TABLE_NAME_LENGTH = 64
# Width of the `name` column created by run().
_MAX_NAME_LENGTH = 255


def _xpath_literal(text):
    """Return *text* as an XPath 1.0 string literal.

    XPath 1.0 has no escape character, so a value that contains an
    apostrophe is wrapped in double quotes, and a value that contains both
    quote kinds is assembled with ``concat()``.
    """
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    pieces = ", \"'\", ".join(f"'{piece}'" for piece in text.split("'"))
    return f"concat({pieces})"


def _table_identifier(query):
    """Return a backtick-quoted MySQL table identifier derived from *query*.

    Spaces become underscores (as before); every other character that is
    not alphanumeric or an underscore is also replaced with an underscore
    so the query text can never terminate the identifier or inject SQL.
    """
    cleaned = "".join(ch if ch.isalnum() or ch == "_" else "_" for ch in query)
    return f"`{cleaned[:_MAX_TABLE_NAME_LENGTH]}`"


def _parse_price(price_text):
    """Convert text such as ``'S$1,200'`` to a float, or None if it is not a number."""
    try:
        return float(price_text.replace('S$', '').replace(',', ''))
    except ValueError:
        return None


def _price_text(name_tag):
    """Return the 'S$...' text from the <div> following *name_tag*, or '' if absent."""
    price_div = name_tag.find_next_sibling('div')
    if price_div is None:
        print(f"Could not find price div for product {name_tag.text}")
        return ''
    price_p = price_div.find('p')
    if price_p is None or 'S$' not in price_p.text:
        print(f"Could not find price p tag for product {name_tag.text}")
        return ''
    return price_p.text


def _seller_text(name_tag):
    """Return the seller name found in the nearest enclosing <div>, or '' if absent."""
    parent_div = name_tag.find_parent('div')
    if parent_div is None:
        print(f"Could not find parent div for product {name_tag.text}")
        return ''
    seller_p = parent_div.find('p', attrs={"data-testid": "listing-card-text-seller-name"})
    if seller_p is None:
        print(f"Could not find seller p tag for product {name_tag.text}")
        return ''
    return seller_p.text


def _listing_link(name_tag):
    """Return the absolute URL of the enclosing '/p/...' anchor, or '' if absent."""
    anchor = name_tag.find_parent('a', href=True)
    if anchor is not None and anchor['href'].startswith('/p/'):
        return _BASE_URL + anchor['href']
    print(f"Could not find link div for product {name_tag.text}")
    return ''


def _load_results_html(driver, search_query, num_clicks):
    """Run the search in *driver*, expand the results, and return the page source."""
    driver.get(_HOME_URL)
    time.sleep(randint(2, 5))

    # Type the query into the search box and submit it.
    search_box = driver.find_element(By.XPATH, "//input[@placeholder='Search for anything and everything']")
    search_box.send_keys(search_query)
    time.sleep(randint(1, 3))
    driver.find_element(By.XPATH, "//button[@type='submit']").click()
    time.sleep(randint(2, 4))

    # If the results heading does not show up, try the Marketplace button instead.
    heading_xpath = (
        "//h1[contains(text(), 'search results for') "
        f"and contains(text(), {_xpath_literal(search_query)})]"
    )
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.XPATH, heading_xpath))
        )
    except TimeoutException:
        try:
            marketplace_paragraph = driver.find_element(By.XPATH, "//p[@title='Marketplace']")
            marketplace_paragraph.find_element(By.XPATH, "./..").click()
            time.sleep(randint(3, 5))
        except NoSuchElementException:
            print("Marketplace button not found")

    # Click "Show more results" up to num_clicks times, stopping when it disappears.
    for _ in range(num_clicks):
        time.sleep(randint(3, 5))
        try:
            button = driver.find_element(By.XPATH, "//button[text()='Show more results']")
        except NoSuchElementException:
            print("No more results")
            break
        button.click()
        time.sleep(randint(8, 10))

    return driver.page_source


def _store_products(connection, table, rows):
    """Create *table* if needed and insert every row that has a parseable price."""
    cursor = connection.cursor()
    try:
        cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {table} "
            "(id INT AUTO_INCREMENT, name VARCHAR(255), price DECIMAL(10, 2), "
            "seller VARCHAR(255), link VARCHAR(1024), date_time DATETIME, PRIMARY KEY (id))"
        )
        add_product = (
            f"INSERT INTO {table} "
            "(name, price, seller, link, date_time) "
            "VALUES (%s, %s, %s, %s, NOW())"
        )
        for name, price, seller, link in rows:
            if price == '':
                continue
            amount = _parse_price(price)
            if amount is None:
                # One odd price must not abort the whole run.
                print(f"Could not parse price {price!r} for product {name}")
                continue
            # Clip to the column width so strict SQL mode does not reject the row.
            cursor.execute(add_product, (name[:_MAX_NAME_LENGTH], amount, seller, link))
        connection.commit()
    finally:
        cursor.close()


def run(query, pages):
    """Scrape Carousell search results for *query* and store them in MySQL.

    Opens carousell.sg in an incognito undetected-chromedriver session,
    submits the query, clicks "Show more results" up to *pages* times, then
    parses the page with BeautifulSoup. Every <p> element whose text
    contains any word of the query is treated as a product name; price,
    seller and link are looked up relative to that element. Rows with a
    price are printed and inserted into a table named after the query.

    NOTE: only a missing or unparseable price keeps a row out of the
    database; rows with a missing seller or link are stored with ''.

    Args:
        query: Search phrase. It is also used, after sanitising, as the
            MySQL table name.
        pages: Maximum number of "Show more results" clicks; any value
            accepted by ``int()``.

    Raises:
        ValueError, TypeError: If *pages* cannot be converted by ``int()``.
    """
    # Validate the cheap inputs before opening a browser or a DB session.
    num_clicks = int(pages)
    query_words = query.lower().split()

    connection = mysql.connector.connect(
        user='your_user',
        password='your_password',
        host='localhost',
        database='your_database'
    )
    try:
        options = uc.ChromeOptions()
        options.add_argument("--incognito")
        driver = uc.Chrome(options=options)
        try:
            html = _load_results_html(driver, query, num_clicks)
            soup = BeautifulSoup(html, 'html.parser')

            # Candidate product names: any <p> mentioning one of the query words.
            name_tags = [
                p for p in soup.find_all('p')
                if any(word in p.text.lower() for word in query_words)
            ]
            product_names = [tag.text for tag in name_tags]
            product_prices = [_price_text(tag) for tag in name_tags]
            product_sellers = [_seller_text(tag) for tag in name_tags]
            product_links = [_listing_link(tag) for tag in name_tags]

            # Print product names, prices, and seller
            for name, price, seller in zip(product_names, product_prices, product_sellers):
                if price != '':
                    print(name, price, seller)

            _store_products(
                connection,
                _table_identifier(query),
                zip(product_names, product_prices, product_sellers, product_links),
            )

            # Leave the browser window up briefly before closing it.
            time.sleep(10)
            driver.close()
        finally:
            # Always end the browser process, even when scraping failed.
            driver.quit()
    finally:
        connection.close()