04. 이미지 수집

1. Pixabay

이미지 URL 추출해서 저장하기

import chromedriver_autoinstaller

import time

from selenium import webdriver

from urllib.request import Request, urlopen

# Open the Pixabay search page, read the first result's image address,
# and save that image as dog.jpg in the current working directory.
driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/강아지/'

try:
    driver.get(url)

    # Read the image address from the first result's <img> element.
    image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div/div[1]/div/a/img'
    image_url = driver.find_element('xpath', image_xpath).get_attribute('src')
finally:
    # Shut the browser down even when navigation or the lookup fails.
    driver.quit()

# Save the image file. The request carries a browser-like User-Agent header.
# NOTE(review): presumably the server rejects urllib's default agent - confirm.
image_byte = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})

# Context managers close both the HTTP response and the file, even on error.
with urlopen(image_byte) as response, open('dog.jpg', 'wb') as f:
    f.write(response.read())

* 이미지는 코드를 실행한 현재 작업 폴더에 dog.jpg 파일로 저장됨

여러개 이미지 불러오기

# selenium 패키지에서 webdriver 모듈을 가져오기

from selenium import webdriver

from selenium.webdriver.common.by import By

# Create the Chrome WebDriver, open the search page, and print the 'src'
# attribute of every <img> inside the result area.
driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/강아지/'

try:
    driver.get(url)

    # Element lookups below keep retrying for up to 3 seconds before failing.
    driver.implicitly_wait(3)

    # Locate the container that holds the search results.
    image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
    image_area = driver.find_element(By.XPATH, image_area_xpath)

    # Collect every <img> element inside the result area.
    image_elements = image_area.find_elements(By.TAG_NAME, 'img')

    # Print each element's 'src' value exactly as the page reports it.
    for image_element in image_elements:
        print(image_element.get_attribute('src'))
finally:
    # Shut the browser down even when navigation or a lookup fails.
    driver.quit()

https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg
https://cdn.pixabay.com/photo/2024/03/14/08/52/pug-8632718_640.jpg
https://cdn.pixabay.com/photo/2016/12/13/05/15/puppy-1903313_640.jpg
https://cdn.pixabay.com/photo/2022/06/18/16/55/cute-7270285_640.png
https://cdn.pixabay.com/photo/2019/04/10/23/51/animal-4118585_640.jpg
https://cdn.pixabay.com/photo/2019/07/23/13/51/shepherd-dog-4357790_640.jpg
https://cdn.pixabay.com/photo/2016/07/15/15/55/dachshund-1519374_640.jpg
https://cdn.pixabay.com/photo/2018/01/09/11/04/dog-3071334_640.jpg
https://cdn.pixabay.com/photo/2020/10/03/11/08/girl-5623231_640.jpg
https://cdn.pixabay.com/photo/2023/11/10/17/10/jack-russell-8379770_640.jpg
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
.
.
.
.
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif

ㄴ 아직 로드되지 않은 이미지는 src 값이 빈 이미지(blank.gif)로 출력되므로, 실제 이미지 주소를 가져오도록 코드를 수정해야 함

이미지 스크롤 내려서 더 많이 가져오기 (아래 코드는 스크롤 대신 'data-lazy-src' 속성에서 실제 이미지 주소를 읽음)

# selenium 패키지에서 webdriver 모듈을 가져오기

from selenium import webdriver

from selenium.webdriver.common.by import By

import time

# Open the search page and collect the address of every result image into
# image_urls, preferring the 'data-lazy-src' attribute over 'src'.
driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/강아지/'

# List that receives the collected image URLs.
image_urls = []

try:
    driver.get(url)

    # Element lookups below keep retrying for up to 3 seconds before failing.
    driver.implicitly_wait(3)

    image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
    image_area = driver.find_element(By.XPATH, image_area_xpath)
    image_elements = image_area.find_elements(By.TAG_NAME, 'img')

    for image_element in image_elements:
        # Use 'data-lazy-src' when present, otherwise fall back to 'src'.
        image_url = (
            image_element.get_attribute('data-lazy-src')
            or image_element.get_attribute('src')
        )

        # An <img> with neither attribute has nothing to download; skipping
        # it keeps None out of the list that later code passes to Request().
        if not image_url:
            continue

        print(image_url)
        image_urls.append(image_url)
finally:
    # Shut the browser down even when navigation or a lookup fails.
    driver.quit()

빈 이미지(blank.gif) 대신 실제 이미지 주소가 출력되어 더 많은 이미지가 출력됨

이미지 저장하기

import os

from urllib import parse

from urllib.request import Request, urlopen

# Download every URL in image_urls and save it as dog<index><extension>.
# The index is the URL's position in the list.
for i, image_url in enumerate(image_urls):
    # Nothing to request for a missing or empty URL.
    if not image_url:
        continue

    # Take the extension from the URL's path component only.
    ext = os.path.splitext(parse.urlparse(image_url).path)[1]

    # The request carries a browser-like User-Agent header.
    image_request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})

    # Context managers close both the HTTP response and the file, even on
    # error. The file is only created once the request has succeeded.
    with urlopen(image_request) as response, open(f'dog{i}{ext}', 'wb') as f:
        f.write(response.read())

다중페이지 수집 : 검색 URL에 페이지번호(pagi)를 부여하고, 키워드 이름의 폴더에 원본 파일명으로 저장

def crawl_and_save_image(keyword, pages):
    """Collect image URLs from Pixabay result pages and download them.

    Uses the module-level Selenium ``driver``, which must exist before this
    function is called. Each image is written to a folder named after
    ``keyword`` (created when missing), under the file name taken from its URL.

    Args:
        keyword: Search term; also used as the output folder name.
        pages: Number of result pages to visit, starting at page 1.
    """
    image_urls = []

    for page in range(1, pages + 1):
        url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'
        driver.get(url)
        time.sleep(3)

        # Scroll repeatedly with a short pause between scrolls.
        # NOTE(review): scrollTo targets an absolute offset (half the page
        # height), so repeating it may not reveal more images - confirm.
        for _ in range(20):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight / 2)')
            time.sleep(0.3)

        image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
        image_area = driver.find_element(By.XPATH, image_area_xpath)

        for image_element in image_area.find_elements(By.TAG_NAME, 'img'):
            image_url = image_element.get_attribute('src')
            print(image_url)
            # An <img> without 'src' yields None, which cannot be requested.
            if image_url:
                image_urls.append(image_url)

    # Create the output folder; exist_ok avoids a separate existence check.
    os.makedirs(keyword, exist_ok=True)

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'}

    # Example URL:
    # https://cdn.pixabay.com/photo/2022/10/25/13/24/puppy-2785074_1280.jpg
    # dict.fromkeys drops repeated URLs while keeping first-seen order, so an
    # address that occurs many times is fetched only once.
    for image_url in dict.fromkeys(image_urls):
        # Build the file name from the URL path, leaving out any query string.
        filename = os.path.basename(parse.urlparse(image_url).path)
        image_byte = Request(image_url, headers=headers)

        # Context managers close both the response and the file, even on error.
        with urlopen(image_byte) as response, open(f'./{keyword}/{filename}', 'wb') as f:
            f.write(response.read())

# crawl_and_save_image reads this module-level driver, so create it first.
driver = webdriver.Chrome()

try:
    crawl_and_save_image('호랑이', 2)
finally:
    # Shut the browser down even when the crawl raises.
    driver.quit()

https://cdn.pixabay.com/photo/2018/01/25/14/12/nature-3106213_1280.jpg
https://cdn.pixabay.com/photo/2017/11/06/09/53/tiger-2923186_640.jpg
https://cdn.pixabay.com/photo/2018/03/26/20/49/tiger-3264048_640.jpg
https://cdn.pixabay.com/photo/2023/12/07/19/45/tiger-8436227_640.jpg
https://cdn.pixabay.com/photo/2017/01/12/21/42/tiger-1975790_640.jpg
https://cdn.pixabay.com/photo/2016/11/29/10/07/tiger-1868911_640.jpg
https://cdn.pixabay.com/photo/2018/05/23/18/54/tiger-3424791_640.jpg
https://cdn.pixabay.com/photo/2013/07/19/00/18/tiger-165189_640.jpg
https://cdn.pixabay.com/photo/2023/03/30/18/27/animal-7888465_640.jpg
https://cdn.pixabay.com/photo/2016/07/18/20/30/tiger-1526704_640.png
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
.............
https://cdn.pixabay.com/photo/2024/05/12/09/03/ai-generated-8756430_640.jpg
https://cdn.pixabay.com/photo/2024/04/19/16/06/ai-generated-8706603_640.jpg

너구리 코드

from urllib import parse

import os

from selenium.webdriver.common.by import By

import chromedriver_autoinstaller

import time

from selenium import webdriver

from urllib.request import Request, urlopen

def crawl_and_save_image(keyword, pages):
    """Collect image URLs from Pixabay result pages and download them.

    Starts its own Chrome session, visits the requested result pages, and
    reads each image address from 'data-lazy-src', falling back to 'src'.
    Each image is written to a folder named after ``keyword`` (created when
    missing), under the file name taken from its URL.

    Args:
        keyword: Search term; also used as the output folder name.
        pages: Number of result pages to visit, starting at page 1.
    """
    image_urls = []
    driver = webdriver.Chrome()

    try:
        for page in range(1, pages + 1):
            url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'
            driver.get(url)
            time.sleep(3)

            image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
            image_area = driver.find_element(By.XPATH, image_area_xpath)

            for image_element in image_area.find_elements(By.TAG_NAME, 'img'):
                # Use 'data-lazy-src' when present, otherwise fall back to 'src'.
                image_url = (
                    image_element.get_attribute('data-lazy-src')
                    or image_element.get_attribute('src')
                )
                # An <img> with neither attribute cannot be requested.
                if image_url:
                    image_urls.append(image_url)
    finally:
        # Shut the browser down even when navigation or a lookup fails.
        driver.quit()

    # Create the output folder; exist_ok avoids a separate existence check.
    os.makedirs(keyword, exist_ok=True)

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'}

    # dict.fromkeys drops repeated URLs while keeping first-seen order, so an
    # address that occurs many times is fetched only once.
    for image_url in dict.fromkeys(image_urls):
        # Build the file name from the URL path, leaving out any query string.
        filename = os.path.basename(parse.urlparse(image_url).path)
        image_byte = Request(image_url, headers=headers)

        # Context managers close both the response and the file, even on error.
        with urlopen(image_byte) as response, open(f'./{keyword}/{filename}', 'wb') as f:
            f.write(response.read())

# Run the crawler for the keyword '너구리' over result pages 1 through 3.
crawl_and_save_image('너구리', 3)

함수로 리팩토링

import chromedriver_autoinstaller

import time

from selenium import webdriver

from urllib.request import Request, urlopen

def save_image(driver, image_xpath, filename):
    """Download the image at ``image_xpath`` on the driver's current page.

    Args:
        driver: Selenium WebDriver that has already loaded the page.
        image_xpath: XPath of the <img> element whose 'src' is downloaded.
        filename: Path of the file the image bytes are written to.

    Returns:
        The image URL read from the element's 'src' attribute.
    """
    # Read the image address.
    image_url = driver.find_element('xpath', image_xpath).get_attribute('src')

    # Save the image file. The request carries a browser-like User-Agent header.
    image_byte = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})

    # Context managers close both the HTTP response and the file, even on error.
    with urlopen(image_byte) as response, open(filename, 'wb') as f:
        f.write(response.read())

    return image_url


driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/강아지/'
image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div/div[1]/div/a/img'

try:
    driver.get(url)
    image_url = save_image(driver, image_xpath, 'dog.jpg')
finally:
    # Shut the browser down even when navigation or the download fails.
    driver.quit()

'데이터분석 > 크롤링' 카테고리의 다른 글

03. 인스타그램 (0)	2024.05.21
02. Selenium, Xpath (네이버웹툰 크롤링) (0)	2024.05.21
01. 크롤링(Crawling) (0)	2024.05.20

leesarr-study

04. 이미지 수집

1. Pixabay

'데이터분석 > 크롤링' 카테고리의 다른 글

티스토리툴바

04. 이미지 수집

1. Pixabay

'데이터분석 > 크롤링' 카테고리의 다른 글

관련글

티스토리툴바