1. Pixabay
- 이미지 URL 추출해서 저장하기
import chromedriver_autoinstaller
import time
from selenium import webdriver
from urllib.request import Request, urlopen

# Pixabay search-results page to scrape.
# NOTE(review): the original snippet used `url` without defining it
# (NameError at runtime) — set it to the page you want to crawl.
url = 'https://pixabay.com/ko/images/search/강아지/'

# Launch Chrome and open the search-results page.
driver = webdriver.Chrome()
driver.get(url)

# Grab the 'src' of the first thumbnail image on the results page.
image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div/div[1]/div/a/img'
image_url = driver.find_element('xpath', image_xpath).get_attribute('src')

# Download the image. A browser-like User-Agent header is sent because
# the CDN rejects bare urllib requests.
image_byte = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})
# Context manager guarantees the file handle is closed even on error.
with open('dog.jpg', 'wb') as f:
    f.write(urlopen(image_byte).read())
더보기
* 해당 폴더에 저장됨
- 여러개 이미지 불러오기
# Bring in the Selenium webdriver module and the By locator helper.
from selenium import webdriver
from selenium.webdriver.common.by import By

# NOTE(review): `url` was undefined in the original (NameError) —
# point it at the Pixabay search-results page to crawl.
url = 'https://pixabay.com/ko/images/search/강아지/'

# Create a Chrome webdriver and open the page.
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(3)  # wait up to 3 s for elements to appear

# Locate the container that holds the thumbnail grid.
image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
image_area = driver.find_element(By.XPATH, image_area_xpath)

# Collect every <img> inside that container and print its 'src'.
# Thumbnails below the fold are lazy-loaded, so many of them print the
# blank.gif placeholder (fixed in the next snippet).
image_elements = image_area.find_elements(By.TAG_NAME, 'img')
for image_element in image_elements:
    print(image_element.get_attribute('src'))
https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg
https://cdn.pixabay.com/photo/2024/03/14/08/52/pug-8632718_640.jpg
https://cdn.pixabay.com/photo/2016/12/13/05/15/puppy-1903313_640.jpg
https://cdn.pixabay.com/photo/2022/06/18/16/55/cute-7270285_640.png
https://cdn.pixabay.com/photo/2019/04/10/23/51/animal-4118585_640.jpg
https://cdn.pixabay.com/photo/2019/07/23/13/51/shepherd-dog-4357790_640.jpg
https://cdn.pixabay.com/photo/2016/07/15/15/55/dachshund-1519374_640.jpg
https://cdn.pixabay.com/photo/2018/01/09/11/04/dog-3071334_640.jpg
https://cdn.pixabay.com/photo/2020/10/03/11/08/girl-5623231_640.jpg
https://cdn.pixabay.com/photo/2023/11/10/17/10/jack-russell-8379770_640.jpg
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
.
.
.
.
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
ㄴ 빈 이미지 태그도 같이 떠서 오류를 수정해야 함
- 이미지 스크롤 내려서 더 많이 가져오기
# Bring in the Selenium webdriver module and helpers.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# NOTE(review): `url` was undefined in the original (NameError).
url = 'https://pixabay.com/ko/images/search/강아지/'

driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(3)  # wait up to 3 s for elements to appear

# Scroll down in small steps so lazy-loaded thumbnails resolve to real
# URLs. (The section title promises scrolling, but the original snippet
# never scrolled — `time` was imported and unused.)
for _ in range(20):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(0.3)

# Locate the thumbnail-grid container and every <img> inside it.
image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
image_area = driver.find_element(By.XPATH, image_area_xpath)
image_elements = image_area.find_elements(By.TAG_NAME, 'img')

# Collected image URLs.
image_urls = []
for image_element in image_elements:
    # Lazy-loaded images keep the real URL in 'data-lazy-src';
    # fall back to 'src' for images that are already loaded.
    image_url = image_element.get_attribute('data-lazy-src')
    if image_url is None:
        image_url = image_element.get_attribute('src')
    print(image_url)
    # Remember the URL for the download step.
    image_urls.append(image_url)
더 많은 이미지가 출력됨
- 이미지 저장하기
import os
from urllib import parse
from urllib.request import Request, urlopen

# Download every collected image URL as dog0.jpg, dog1.png, ...
for i, image_url in enumerate(image_urls):
    # Skip lazy-load placeholders (None / blank.gif) — the original
    # saved them as bogus image files.
    if not image_url or image_url.endswith('blank.gif'):
        continue
    # Parse the URL path so the real file extension can be reused.
    parsed = parse.urlparse(image_url)
    _, ext = os.path.splitext(parsed.path)
    # Browser-like User-Agent — the CDN rejects bare urllib requests.
    image_request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})
    # Context manager closes the file even if the download raises.
    with open(f'dog{i}{ext}', 'wb') as f:
        f.write(urlopen(image_request).read())
- 다중페이지 출력 : 파일제목(페이지번호 부여)
def crawl_and_save_image(keyword, pages):
    """Crawl `pages` Pixabay result pages for `keyword` and save every
    image into a folder named after the keyword.

    NOTE(review): relies on a module-level `driver` (created below) and
    on `By`, `time`, `os`, `parse`, `Request`, `urlopen` imported earlier.
    """
    image_urls = []
    for page in range(1, pages + 1):
        # NOTE(review): the original used `url` here without defining it
        # (NameError); build the paginated search URL as in the variant
        # further down the post.
        url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'
        driver.get(url)
        time.sleep(3)
        # Scroll in steps so lazy-loaded thumbnails are resolved.
        for _ in range(20):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight / 2)')
            time.sleep(0.3)
        # Thumbnail-grid container and every <img> inside it.
        image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
        image_area = driver.find_element(By.XPATH, image_area_xpath)
        image_elements = image_area.find_elements(By.TAG_NAME, 'img')
        for image_element in image_elements:
            # Prefer the lazy-load attribute; 'src' alone yields the
            # blank.gif placeholder for off-screen thumbnails.
            image_url = image_element.get_attribute('data-lazy-src')
            if image_url is None:
                image_url = image_element.get_attribute('src')
            print(image_url)
            image_urls.append(image_url)

    # Create the destination folder on first use.
    if not os.path.exists(keyword):
        os.mkdir(keyword)

    for image_url in image_urls:
        # Skip placeholders that carry no real image.
        if not image_url or image_url.endswith('blank.gif'):
            continue
        # Use the last path segment as the local file name.
        # (The original wrote every image to the literal name '(unknown)'
        # — a paste artifact — overwriting the same file repeatedly.)
        filename = image_url.split('/')[-1]
        image_byte = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'})
        with open(f'./{keyword}/{filename}', 'wb') as f:
            f.write(urlopen(image_byte).read())


driver = webdriver.Chrome()
crawl_and_save_image('호랑이', 2)
https://cdn.pixabay.com/photo/2018/01/25/14/12/nature-3106213_1280.jpg
https://cdn.pixabay.com/photo/2017/11/06/09/53/tiger-2923186_640.jpg
https://cdn.pixabay.com/photo/2018/03/26/20/49/tiger-3264048_640.jpg
https://cdn.pixabay.com/photo/2023/12/07/19/45/tiger-8436227_640.jpg
https://cdn.pixabay.com/photo/2017/01/12/21/42/tiger-1975790_640.jpg
https://cdn.pixabay.com/photo/2016/11/29/10/07/tiger-1868911_640.jpg
https://cdn.pixabay.com/photo/2018/05/23/18/54/tiger-3424791_640.jpg
https://cdn.pixabay.com/photo/2013/07/19/00/18/tiger-165189_640.jpg
https://cdn.pixabay.com/photo/2023/03/30/18/27/animal-7888465_640.jpg
https://cdn.pixabay.com/photo/2016/07/18/20/30/tiger-1526704_640.png
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
https://pixabay.com/static/img/blank.gif
.............
https://cdn.pixabay.com/photo/2024/05/12/09/03/ai-generated-8756430_640.jpg
https://cdn.pixabay.com/photo/2024/04/19/16/06/ai-generated-8706603_640.jpg
- 너구리 코드
더보기
from urllib import parse
import os
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller
import time
from selenium import webdriver
from urllib.request import Request, urlopen


def crawl_and_save_image(keyword, pages):
    """Crawl `pages` Pixabay search pages for `keyword` and save the
    images into ./<keyword>/."""
    image_urls = []
    driver = webdriver.Chrome()
    for page in range(1, pages + 1):
        # Paginated search URL for this keyword.
        url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'
        driver.get(url)
        time.sleep(3)
        # Thumbnail-grid container and every <img> inside it.
        image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
        image_area = driver.find_element(By.XPATH, image_area_xpath)
        image_elements = image_area.find_elements(By.TAG_NAME, 'img')
        for image_element in image_elements:
            # Prefer the lazy-load URL; fall back to 'src'.
            image_url = image_element.get_attribute('data-lazy-src')
            if image_url is None:
                image_url = image_element.get_attribute('src')
            image_urls.append(image_url)

    # Create the destination folder if it does not exist yet.
    # (The original used C-style '//' comments — a SyntaxError in
    # Python — and called 'ps.path.exists', a typo for 'os.path.exists'.)
    if not os.path.exists(keyword):
        os.mkdir(keyword)

    for image_url in image_urls:
        # Skip lazy-load placeholders with no real image URL.
        if not image_url or image_url.endswith('blank.gif'):
            continue
        # Last path segment becomes the local file name.
        # (Original wrote to the literal path '(unknown)' — a paste
        # artifact — clobbering the same file on every iteration.)
        filename = image_url.split('/')[-1]
        image_byte = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'})
        with open(f'./{keyword}/{filename}', 'wb') as f:
            f.write(urlopen(image_byte).read())


crawl_and_save_image('너구리', 3)
- 함수로 리팩토링
import chromedriver_autoinstaller
import time
from selenium import webdriver
from urllib.request import Request, urlopen


def download_first_image(url, save_path='dog.jpg'):
    """Open `url` in Chrome, grab the first thumbnail's 'src', and save
    it to `save_path`.

    NOTE(review): the section heading says "refactor into a function",
    but the original code was a byte-copy of the first script (and still
    used an undefined `url`). This is the actual refactoring: the URL
    and output path become parameters, and the browser is always closed.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # First thumbnail on the Pixabay results page.
        image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div/div[1]/div/a/img'
        image_url = driver.find_element('xpath', image_xpath).get_attribute('src')
        # Browser-like User-Agent — the CDN rejects bare urllib requests.
        image_byte = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})
        with open(save_path, 'wb') as f:
            f.write(urlopen(image_byte).read())
    finally:
        # Always release the browser, even if the download fails.
        driver.quit()
'데이터분석 > 크롤링' 카테고리의 다른 글
03. 인스타그램 (0) | 2024.05.21 |
---|---|
02. Selenium, Xpath (네이버웹툰 크롤링) (0) | 2024.05.21 |
01. 크롤링(Crawling) (0) | 2024.05.20 |