1 minute read

ChatGPT를 이용해서 웹크롤링 해보기

google에서 검색한 후에 이미지들을 다운받는 방법.

–> google 크롤링이 막혔는지.. API키랑 권한이 있어야 한다고 한다 ㅜㅜ

  1. naver 검색한 페이지 링크를 chatgpt 에 넣고
    image

  2. 해당 코드(크롤링 후 저장해 달라는)를 chatgpt를 이용해서 python으로 생성한 후에
    image

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

# Download every <img> found on a Naver image-search results page into ./images.

# Create a directory to store the images (no-op if it already exists).
image_dir = "images"
os.makedirs(image_dir, exist_ok=True)

# Define the URL of the Naver search page (query=%EA%B7%B8%EB%A6%BC is "그림").
url = "https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EA%B7%B8%EB%A6%BC"

# Send a GET request to the Naver search page; fail fast on HTTP errors
# instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Find all image tags on the page
image_tags = soup.find_all("img")

# Loop through each image tag and download the image
for i, tag in enumerate(image_tags):
    # Some <img> tags have no "src" (lazy-loaded images use data-src);
    # tag["src"] would raise KeyError, so use .get() and skip those.
    image_url = tag.get("src")
    if not image_url or image_url.startswith("data:"):
        # data: URIs are inline placeholders, not downloadable URLs.
        continue

    # Extract the image file name from the "fname" query parameter when
    # present. The original code did query_params["fname"][0] unguarded,
    # which raises KeyError for URLs without fname — fall back to an
    # index-based name instead.
    parsed_url = urlparse(image_url)
    query_params = parse_qs(parsed_url.query)
    if "fname" in query_params:
        # basename() strips any path components embedded in fname.
        file_name = os.path.basename(query_params["fname"][0])
    else:
        file_name = f"image_{i+1}.jpg"

    # Set the file path to save the image
    file_path = os.path.join(image_dir, file_name)

    # Download the image (separate variable so the page response above
    # is not clobbered).
    image_response = requests.get(image_url, timeout=10)
    with open(file_path, "wb") as file:
        file.write(image_response.content)

    print(f"Downloaded image {i+1}/{len(image_tags)}")

print("All images downloaded successfully!")
  1. colab에 접속해서 jupyter notebook에서 동작을 시켜서 image
    에러가 난다. ㅜㅜ
    모.. 다시 해결해 달라고 하고..
    image
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

# Download every <img> found on a Naver image-search results page into
# ./images, naming files image_1.jpg, image_2.jpg, ...

# Create a directory to store the images (no-op if it already exists).
image_dir = "images"
os.makedirs(image_dir, exist_ok=True)

# Define the URL of the Naver search page (query=%EA%B7%B8%EB%A6%BC is "그림").
url = "https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EA%B7%B8%EB%A6%BC"

# Send a GET request to the Naver search page; fail fast on HTTP errors
# instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Find all image tags on the page
image_tags = soup.find_all("img")

# Loop through each image tag and download the image
for i, tag in enumerate(image_tags):
    # Some <img> tags have no "src" (lazy-loaded images use data-src);
    # tag["src"] would raise KeyError, so use .get() and skip those.
    image_url = tag.get("src")
    if not image_url or image_url.startswith("data:"):
        # data: URIs are inline placeholders, not downloadable URLs.
        continue

    # Set the file path to save the image
    file_name = f"image_{i+1}.jpg"
    file_path = os.path.join(image_dir, file_name)

    # Download the image (separate variable so the page response above
    # is not clobbered).
    image_response = requests.get(image_url, timeout=10)
    with open(file_path, "wb") as file:
        file.write(image_response.content)

    print(f"Downloaded image {i+1}/{len(image_tags)}")

print("All images downloaded successfully!")

  1. 이미지가 저장된것을 확인할 수 있다.
    image
    image

참고 영상 (google 크롤링은 ㅜㅜ)