travel data science
잡코리아 채용정보 크롤링 본문
정보를 한정적으로 크롤링했습니다.
"""Crawl JobKorea (www.jobkorea.co.kr) recruitment listings.

Drives a Chrome browser through the 14 top-level job-duty categories and
each of their sub-categories, scrapes company / title / URL / experience /
salary from the result table, prints each row, and appends it to craw.csv
(euc-kr encoded).  Selector paths and the magic numbers below are tied to
the site's 2020-era markup.
"""
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
from time import sleep
import ast

CHROMEDRIVER_PATH = "./chromedriver"
BASE_URL = "https://www.jobkorea.co.kr/recruit/joblist?menucode=local&localorder=1"

# Numeric suffix of each mid-category <ul id="duty_step2_100XX_ly"> element,
# indexed by top-category position (site-specific magic numbers).
MIDDLE_IDS = (12, 13, 16, 19, 14, 15, 22, 18, 17, 23, 21, 24, 20, 25)
# Number of sub-categories under each of the 14 top-level categories.
SUB_COUNTS = {1: 7, 2: 3, 3: 17, 4: 8, 5: 8, 6: 11, 7: 9,
              8: 7, 9: 4, 10: 9, 11: 6, 12: 8, 13: 11, 14: 11}

driver = webdriver.Chrome(CHROMEDRIVER_PATH)
sleep(0.5)

# Open the output file once instead of reopening it for every scraped row.
with open('craw.csv', 'a+', newline='', encoding='euc-kr') as f:
    wr = csv.writer(f)
    for i in range(1, 15):
        driver.get(BASE_URL)
        # Select the 13th locality item in the region filter.
        driver.find_element_by_xpath('//*[@id="devSearchForm"]/div[2]/div/div[1]/dl[2]/dd[2]/div[2]/dl[1]/dd/div[1]/ul/li[13]').click()
        sleep(0.1)
        # Open the job-duty tab, then click top-level category i.
        driver.find_element_by_xpath('//*[@id="devSearchForm"]/div[2]/div/div[1]/dl[1]/dt').click()
        sleep(0.1)
        big_xpath = ('//*[@id="devSearchForm"]/div[2]/div/div[1]/dl[1]/dd[2]'
                     '/div[2]/dl[1]/dd/div[1]/ul/li[' + str(i) + ']')
        driver.find_element_by_xpath(big_xpath).click()

        job_soup = bs(driver.page_source, 'html.parser')
        job_lists = job_soup.select("#devSearchForm > div.detailArea > div > div:nth-child(1) > dl.job.circleType.dev-tab.dev-duty.on > dd.ly_sub > div.ly_sub_cnt.colm3-ty1.clear > dl:nth-child(1) > dd > div.nano-content.dev-main")
        for job_list in job_lists:
            # data-value-json holds a Python-dict-style literal,
            # e.g. {'groupName': ...}; literal_eval parses it safely.
            li = job_list.select_one("ul > li:nth-child(" + str(i) + ")")
            job_name = ast.literal_eval(li['data-value-json'])['groupName']

            for k in range(1, SUB_COUNTS[i] + 1):
                middle_xpath = ('//*[@id="duty_step2_100' + str(MIDDLE_IDS[i - 1])
                                + '_ly"]/li[' + str(k) + ']')
                driver.find_element_by_xpath(middle_xpath).click()
                sleep(0.1)
                # Apply the selected filters (search button).
                driver.find_element_by_xpath('//*[@id="devCndtDispArea"]/div/dl[1]/dd[2]').click()
                sleep(0.1)

                soup = bs(driver.page_source, 'html.parser')
                rows = soup.select("#dev-gi-list > div > div.tplList.tplJobList > table > tbody > tr")
                # `row` replaces the original loop name `list`, which
                # shadowed the builtin.  Selectors are row-relative now;
                # repeating the full document path per row was redundant.
                for row in rows:
                    list_corp = row.select_one('td.tplCo > a').get_text()
                    title_a = row.select_one('td.tplTit > div > strong > a')
                    list_title = title_a['title']
                    list_href = title_a['href']
                    list_url = 'https://www.jobkorea.co.kr' + list_href
                    list_years = row.select_one('td.tplTit > div > p.etc > span:nth-child(1)').get_text()
                    # Salary span is optional on the site; keep None when absent.
                    list_money = row.select_one('td.tplTit > div > p.etc > span:nth-child(5)')
                    if list_money is not None:
                        list_money = list_money.get_text()

                    print(job_name, list_corp, list_title, list_href, list_url, list_years, list_money)
                    print("======================")
                    wr.writerow([job_name, list_corp, list_title, list_href, list_url, list_years, list_money])

print('종료')
# quit() ends the whole driver session; close() would only close one window
# and leave the chromedriver process running.
driver.quit()
궁금한거 있으면 댓글 다세요