
project

Crawling JobKorea Job Listings

가방이 2022. 1. 6. 13:42

I crawled only a limited slice of the listing data. The script below opens the JobKorea job-list page, walks through all 14 top-level job categories and each of their sub-categories, and saves the company name, posting title, link, required experience, and salary of every listing to craw.csv.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup as bs
import csv
from time import sleep
import ast  # parses the JSON-like 'data-value-json' attribute into a dict

# One-off initialisation of an output file (run once, then leave commented out):
#f=open('corporation.csv', 'w', encoding='euc-kr', newline='')
#wr=csv.writer(f)
#f.close()

# Launch Chrome through the local chromedriver binary (Selenium 4 style).
dir_driver = "./chromedriver"
driver = webdriver.Chrome(service=Service(dir_driver))
sleep(0.5)

base_url = "https://www.jobkorea.co.kr/recruit/joblist?menucode=local&localorder=1"

# For each top-level job category i (1-14): middle_lala[i-1] is the numeric
# suffix of that category's sub-category menu id ("duty_step2_100XX_ly"),
# and lala[i] is how many sub-categories it has.
middle_lala = (12, 13, 16, 19, 14, 15, 22, 18, 17, 23, 21, 24, 20, 25)
lala = {1: 7, 2: 3, 3: 17, 4: 8, 5: 8, 6: 11, 7: 9, 8: 7, 9: 4, 10: 9,
        11: 6, 12: 8, 13: 11, 14: 11}

for i in range(1, 15):
    # Reload the list page, pick a fixed option in the first filter panel,
    # then open the job-category panel and click the i-th top-level category.
    driver.get(base_url)
    driver.find_element(By.XPATH, '//*[@id="devSearchForm"]/div[2]/div/div[1]/dl[2]/dd[2]/div[2]/dl[1]/dd/div[1]/ul/li[13]').click()
    sleep(0.1)
    driver.find_element(By.XPATH, '//*[@id="devSearchForm"]/div[2]/div/div[1]/dl[1]/dt').click()
    sleep(0.1)
    big_xpath = '//*[@id="devSearchForm"]/div[2]/div/div[1]/dl[1]/dd[2]/div[2]/dl[1]/dd/div[1]/ul/li[' + str(i) + ']'
    driver.find_element(By.XPATH, big_xpath).click()
    
    # Read the category's display name from the menu item's 'data-value-json'
    # attribute, which holds a dict literal with a 'groupName' key.
    job_soup = bs(driver.page_source, 'html.parser')
    job_lists = job_soup.select("#devSearchForm > div.detailArea > div > div:nth-child(1) > dl.job.circleType.dev-tab.dev-duty.on > dd.ly_sub > div.ly_sub_cnt.colm3-ty1.clear > dl:nth-child(1) > dd > div.nano-content.dev-main")

    for job_list in job_lists:
        job_name = job_list.select_one("ul > li:nth-child(" + str(i) + ")")['data-value-json']
        job_name = ast.literal_eval(job_name)['groupName']

    # Tick every sub-category of the current category, then apply the filter.
    for k in range(1, lala[i] + 1):
        middle_xpath = '//*[@id="duty_step2_100' + str(middle_lala[i - 1]) + '_ly"]/li[' + str(k) + ']'
        driver.find_element(By.XPATH, middle_xpath).click()
        sleep(0.1)

    driver.find_element(By.XPATH, '//*[@id="devCndtDispArea"]/div/dl[1]/dd[2]').click()
    sleep(0.1)
    
    # Each <tr> in the result table is one job posting.
    soup = bs(driver.page_source, 'html.parser')
    rows = soup.select("#dev-gi-list > div > div.tplList.tplJobList > table > tbody > tr")

    for row in rows:
        list_corp = row.select_one('td.tplCo > a').get_text()
        list_title = row.select_one('td.tplTit > div > strong > a')['title']
        list_href = row.select_one('td.tplTit > div > strong > a')['href']
        list_url = 'https://www.jobkorea.co.kr' + list_href
        list_years = row.select_one('td.tplTit > div > p.etc > span:nth-child(1)').get_text()
        # The salary span is missing from some rows, so guard against None.
        list_money = row.select_one('td.tplTit > div > p.etc > span:nth-child(5)')
        if list_money is not None:
            list_money = list_money.get_text()

        print(job_name, list_corp, list_title, list_href, list_url, list_years, list_money)
        print("======================")

        # Append one row per posting; 'a+' mode preserves rows from earlier runs.
        f = open('craw.csv', 'a+', newline='', encoding='euc-kr')
        wr = csv.writer(f)
        wr.writerow([job_name, list_corp, list_title, list_href, list_url, list_years, list_money])
        f.close()

print('done')
driver.quit()  # shut the browser down completely
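
The fixed sleep() calls are the fragile part of this script: if the page updates slowly, the next XPath lookup can fail. A minimal sketch of swapping them for Selenium's explicit waits (the wait_and_click helper below is hypothetical, not part of the original script):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_and_click(driver, xpath, timeout=10):
    # Hypothetical helper: block until the element is clickable, then click it.
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    element.click()

# e.g. replace each sleep()/find_element pair above with:
# wait_and_click(driver, big_xpath)

This waits only as long as the element actually needs, so the crawl is both faster and less likely to break on a slow connection.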

If you have any questions, leave a comment.
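
One last tip: to sanity-check what was collected, here is a minimal sketch for loading craw.csv with pandas, assuming the euc-kr encoding used above; the column names are hypothetical, since the script writes no header row.

import pandas as pd

# Hypothetical column names; craw.csv is written without a header row.
cols = ['job_group', 'company', 'title', 'href', 'url', 'experience', 'salary']
df = pd.read_csv('craw.csv', names=cols, encoding='euc-kr')

# Repeated runs append to the same file, so drop duplicate postings by URL.
df = df.drop_duplicates(subset='url')
print(df.head())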