最初のコードブロック
%%shell
# Register the Debian "buster" repositories as an extra APT source so that
# the non-snap chromium / chromium-driver packages can be installed.

# Add the Debian buster package sources. The heredoc delimiter is quoted with
# plain ASCII quotes so the body is written out verbatim.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Fetch the signing keys referenced by the sources above.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file named in the matching "signed-by=".
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Pin priorities: only chromium* packages get a Debian priority (700) above
# the eoan release pin (500); every other Debian package stays lower (300).
# Records in an APT preferences file must be separated by blank lines.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500

Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300

Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF
次ブロック
# Refresh the package index so the newly added Debian sources are visible.
!apt-get update
# -y answers the confirmation prompt so the cell can run unattended.
!apt-get install -y chromium chromium-driver
!pip install -q selenium
確認
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
# Scrape the Amazon.co.jp new-releases grid with headless Chromium and collect
# each product's link URL and link text into a pandas DataFrame.

# Upper bound on scroll rounds so the script cannot loop forever when the page
# never exposes 50 grid items (e.g. a shorter list or a blocked page).
MAX_SCROLL_ATTEMPTS = 20

options = webdriver.ChromeOptions()
# Run headless (in the background); required on Colab.
options.add_argument("--headless")
# Disable the sandbox; also required on Colab.
options.add_argument("--no-sandbox")
# Recommended as well.
options.add_argument("--disable-dev-shm-usage")

results = []

# Create the driver before entering `try` so that `finally` never refers to an
# unbound name when browser start-up itself fails. The driver path is not
# passed positionally: current Selenium releases do not accept it there.
driver = webdriver.Chrome(options=options)
try:
    # Wait up to 10 seconds for elements to appear during element lookups.
    driver.implicitly_wait(10)
    base_url = "https://www.amazon.co.jp/gp/new-releases/books/466298/ref=zg_bsnr_unv_books_2_492350_1"
    driver.get(base_url)
    time.sleep(5)

    # Scroll down until 50 grid items are present or the attempt budget is
    # used up; further items only appear after scrolling.
    for _ in range(MAX_SCROLL_ATTEMPTS):
        if len(driver.find_elements(By.ID, "gridItemRoot")) >= 50:
            break
        time.sleep(5)
        driver.execute_script("window.scrollBy(0, 3000);")

    # One element per product tile in the grid.
    product_elements = driver.find_elements(By.ID, "gridItemRoot")
    for i_section in product_elements:
        result_row = {}
        # Product link: supplies both the URL and the product name.
        a_element = i_section.find_element(
            By.CSS_SELECTOR,
            "div > div:nth-child(2) > div > a:nth-child(2)",
        )
        result_row["url"] = a_element.get_attribute("href")
        # Product name
        result_row["name"] = a_element.text
        print(result_row)
        results.append(result_row)
finally:
    # Always shut the browser down, even if scraping raised.
    driver.quit()

df = pd.DataFrame(results)
print(df.head())