めうの雑記

備忘録

Pythonで価格.comから情報を取得する

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib.parse
import urllib.request as req
import re

# Output file shared by every scraping() call; it is closed at the end of the
# script. The encoding is pinned to UTF-8 so that writing Japanese text does
# not depend on the platform's default locale encoding.
f = open("result.txt", "w", encoding="utf-8")

# Result-page numbers to fetch, in order.
page_list = [1, 2, 3]

def _collect_item_urls(soup):
    """Return the href of every anchor whose link carries the item marker."""
    # href=True skips anchors that have no href attribute at all; indexing
    # link["href"] on such a tag would raise KeyError.
    return [
        link["href"]
        for link in soup.find_all("a", href=True)
        if "/?lid=pc_ksearch_kakakuitem" in link["href"]
    ]


def _write_items(tokens, item_urls):
    """Write the page tokens to the output file, one per line.

    A token containing "発売日" is preceded by the next item URL. Writing
    stops at the first token that is exactly "※".
    """
    url_index = 0
    for token in tokens:
        if token == "※":
            break
        if "発売日" in token:
            # Guard the lookup so a page with fewer matching links than
            # expected does not abort the run with IndexError mid-write.
            if url_index < len(item_urls):
                f.write(item_urls[url_index] + "\n")
            # The index advances by two per item.
            # NOTE(review): presumably each item contributes two matching
            # links on the page - confirm against the live markup.
            url_index += 2
        f.write(token + "\n")


def scraping(page_count):
    """Fetch one search-result page and append its text to the output file.

    Args:
        page_count: Page number to request (appended to the ``page`` query).

    Returns:
        True if the caller should go on to the next page, False if the user
        answered anything other than "yes" at the continue prompt. The
        prompt is only shown when page_count is below max(page_list).
    """
    print(str(page_count) + "ページ目")

    # NOTE(review): the percent-encoded path segment looks like a Shift_JIS
    # encoded search keyword - confirm before changing it.
    url = "https://kakaku.com/search_results/%97%E2%91%A0%8C%C9/?page=" + str(page_count)

    # Close the HTTP response as soon as the document has been parsed.
    with req.urlopen(url) as res:
        soup = BeautifulSoup(res, "html.parser")

    # Remove the javascript and head sections so their content does not end
    # up in the extracted text.
    for tag in soup("script"):
        tag.decompose()
    for tag in soup("head"):
        tag.decompose()

    item_urls = _collect_item_urls(soup)

    # Newlines are removed first, then the text is split on whitespace.
    tokens = soup.text.replace("\n", "").split()
    # Drop the leading tokens that come before the item listing.
    # NOTE(review): the original comment said "15~18"; 18 is presumably the
    # size of the page header in tokens - confirm against the live page.
    del tokens[:18]

    _write_items(tokens, item_urls)

    if page_count < max(page_list):
        choices = input("続行しますか?(yes or other key)>>")
        if choices != "yes":
            return False
    return True


# Fetch the pages in order, stopping as soon as the user declines to continue.
# The output file is closed even if a page fails part-way through.
try:
    for i in page_list:
        if not scraping(i):
            break
finally:
    f.close()


参考にした書籍