import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime, timedelta

# Base URL of the job listings page for software development jobs
base_url = 'https://www.jobindex.dk/jobsoegning/it/'

# Keywords to include (matched against the job title and description)
include_keywords = ['software', 'nextjs', 'svelte']

# Keywords to exclude from the job titles
exclude_keywords = ['senior']

# File path for the JSON database
db_file = 'jobs_db.json'

# Load existing jobs from the JSON file if it exists
if os.path.exists(db_file):
    try:
        with open(db_file, 'r') as file:
            existing_jobs = json.load(file)
        print(f"Loaded {len(existing_jobs)} existing jobs from {db_file}")
    except json.JSONDecodeError:
        print("Error reading JSON file. Starting with an empty list.")
        existing_jobs = []
else:
    print("JSON file not found. Starting with an empty list.")
    existing_jobs = []

# Convert existing jobs to a set of IDs for duplicate checking
existing_job_ids = set(job['id'] for job in existing_jobs)

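# Note: job IDs are composite strings (title-url-location-date), so the same posting
# re-listed with a new date, or with a changed title, is treated as a brand-new job.
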
# Function to fetch and parse a page
def fetch_jobs_from_page(url):
    response = requests.get(url, timeout=30)  # timeout so a hung request doesn't stall the scraper
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url}")
    return BeautifulSoup(response.content, 'html.parser')

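# Note: some job boards throttle or block requests' default User-Agent. If pages start
# coming back empty or with non-200 statuses, passing a browser-like header is one option, e.g.:
#   requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
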
# Function to extract jobs from the parsed HTML
def extract_jobs(soup):
    job_listings = soup.find_all('div', class_='jobsearch-result')
    jobs = []
    today = datetime.today()
    for job in job_listings:
        heading = job.find('h4')
        title_element = heading.find('a') if heading else None
        company_element = job.find('div', class_='jix-toolbar-top__company')
        location_element = job.find('span', class_='jix_robotjob--area')
        date_posted_element = job.find('time')
        job_link_element = title_element  # the title anchor doubles as the job link
        description_element = job.find('p')

        title = title_element.get_text(strip=True) if title_element else ''
        url = company_element.find('a', rel='noopener')['href'] if company_element and company_element.find('a', rel='noopener') else ''
        img = f"https://www.jobindex.dk{job.find('img')['src']}" if job.find('img') else ''
        company = company_element.find('a').get_text(strip=True) if company_element else ''
        location = location_element.get_text(strip=True) if location_element else ''
        date_posted = date_posted_element['datetime'] if date_posted_element else ''
        job_link = job_link_element['href'] if job_link_element else ''
        description = description_element.get_text(strip=True) if description_element else ''

        print(f"Debug: title={title}, url={url}, img={img}, company={company}, location={location}, date_posted={date_posted}, job_link={job_link}, description={description}")  # Detailed debug print

        # Convert date_posted to a datetime object; stop once listings are older than 3 days
        try:
            date_posted_dt = datetime.strptime(date_posted, '%Y-%m-%d')
            if (today - date_posted_dt).days > 3:
                print("Job older than 3 days found. Stopping the scraper.")
                return jobs, False  # False tells the caller to stop paginating
        except ValueError:
            continue  # skip listings without a parseable date

        # Keep the job only if the title has no exclude keyword and the title/description matches an include keyword
        combined_text = f"{title} {description}".lower()
        if not any(exclude_keyword in title.lower() for exclude_keyword in exclude_keywords) and any(include_keyword in combined_text for include_keyword in include_keywords):
            job_id = f"{title}-{url}-{location}-{date_posted}"
            if job_id not in existing_job_ids:
                jobs.append({
                    'id': job_id,
                    'title': title,
                    'url': url,
                    'img': img,
                    'company': company,
                    'location': location,
                    'date_posted': date_posted,
                    'link': job_link
                })
                existing_job_ids.add(job_id)
                print(f"Added job: {job_id}")  # Debug print for each added job

    return jobs, True

# Function to find the next page URL
def get_next_page_url(soup):
    next_page = soup.find('a', {'aria-label': 'Næste'})
    return next_page['href'] if next_page else None

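# Note: this assumes the 'Næste' link carries an absolute URL. If jobindex.dk returns a
# relative href instead, it could be resolved against the listing page, e.g.:
#   from urllib.parse import urljoin
#   return urljoin(base_url, next_page['href']) if next_page else None
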
# Main scraping loop
current_url = base_url
all_jobs = []

while current_url:
    print(f"Fetching jobs from: {current_url}")
    soup = fetch_jobs_from_page(current_url)
    jobs, continue_scraping = extract_jobs(soup)
    all_jobs.extend(jobs)
    print(f"Collected {len(jobs)} jobs from this page.")
    if not continue_scraping:
        break
    current_url = get_next_page_url(soup)

# Combine the existing jobs with the new jobs
all_jobs = existing_jobs + all_jobs

# Remove jobs older than 30 days from the combined list
cutoff_date = datetime.today() - timedelta(days=30)
all_jobs = [job for job in all_jobs if datetime.strptime(job['date_posted'], '%Y-%m-%d') >= cutoff_date]

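# Note: this assumes every stored job has a parseable 'date_posted'. If entries with an
# empty or missing date ever reach the file, a guard avoids a ValueError here, e.g.:
#   all_jobs = [job for job in all_jobs
#               if job.get('date_posted') and datetime.strptime(job['date_posted'], '%Y-%m-%d') >= cutoff_date]
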
# Final debug print before saving
print(f"Total jobs to be saved: {len(all_jobs)}")
print(f"Jobs to be saved: {all_jobs}")  # Debug print to show jobs to be saved

# Save the combined list (existing + new jobs) back to the JSON file
with open(db_file, 'w') as file:
    json.dump(all_jobs, file, indent=4)

print(f"Total jobs saved: {len(all_jobs)}")  # Final output after saving to file