-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
92 lines (73 loc) · 3.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#import libraries
import logging
import os
import re

import numpy as np
import PyPDF2
import textract
import torch
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
# Fetch the job-descriptions dataset from the Hugging Face Hub and keep only
# the first 15 entries of the train split's 'job_description' column.
job_descriptions = load_dataset("jacob-hugging-face/job-descriptions")['train']['job_description'][:15]
# Load the pretrained DistilBERT (uncased) encoder and its matching tokenizer;
# both are module-level objects used by the matching code further down.
model = AutoModel.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text that textract extracts from the file at *pdf_path*.

    Extraction is best-effort: if ``textract.process`` raises, or its output
    cannot be decoded as UTF-8, the failure is logged as a warning and an
    empty string is returned instead of propagating the error.

    Args:
        pdf_path: Path of the file handed to ``textract.process``.

    Returns:
        The decoded text, or ``""`` when extraction fails.
    """
    try:
        return textract.process(pdf_path).decode("utf-8")
    except Exception as exc:
        # Deliberately broad: one unreadable file must not abort a batch run,
        # but the reason is recorded so the failure is not invisible.
        logging.getLogger(__name__).warning(
            "Could not extract text from %s: %s", pdf_path, exc
        )
        return ""
# Normalise raw text before it is tokenised
def preprocess_text(text):
    """Lower-case *text* and delete all punctuation.

    Every character that is neither a word character (``\\w``, which includes
    digits and the underscore) nor whitespace is removed outright; nothing is
    inserted in its place and whitespace is left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with punctuation stripped.
    """
    return re.sub(r'[^\w\s]', '', text.lower())
# Cosine similarity between one CV embedding and one job-description embedding
def calculate_similarity(cv_embedding, job_description_embedding):
    """Return the cosine similarity of two embeddings as a single number.

    Each argument is reshaped to a single row before being passed to
    ``cosine_similarity``, so the result is a 1x1 matrix whose only entry is
    returned.

    Args:
        cv_embedding: Array-like object exposing ``reshape`` (e.g. a NumPy array).
        job_description_embedding: Array-like object exposing ``reshape``.

    Returns:
        The entry at ``[0][0]`` of the similarity matrix.
    """
    cv_row = cv_embedding.reshape(1, -1)
    job_row = job_description_embedding.reshape(1, -1)
    score_matrix = cosine_similarity(cv_row, job_row)
    return score_matrix[0][0]
# Specify the folder containing CVs in PDF format
cv_folder = "/content/data/data/ACCOUNTANT"  # Replace with the path to your CVs


def _embed_text(text):
    """Return the mean-pooled DistilBERT embedding of *text* as a NumPy array.

    The text is truncated to at most 512 tokens. No padding is applied: a
    single sequence does not need it, and padding to ``max_length`` without
    passing an attention mask would make the model attend to, and the mean
    pool average over, ``[PAD]`` tokens.

    Args:
        text: The (already preprocessed) string to embed.

    Returns:
        The token-wise mean (``dim=1``) of ``last_hidden_state``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states.mean(dim=1).numpy()


# Embed every CV exactly once. A CV's embedding does not depend on the job
# description, so doing this inside the job loop would repeat the PDF
# extraction and the model forward pass for every job.
cv_embeddings = {}
# Sorted so that the processing order (and tie-breaking below) is reproducible.
for cv_file in sorted(os.listdir(cv_folder)):
    cv_path = os.path.join(cv_folder, cv_file)
    cv_text = preprocess_text(extract_text_from_pdf(cv_path))
    if not cv_text.strip():
        # extract_text_from_pdf returns "" on failure; ranking an empty
        # document would only add a meaningless score.
        print(f"Skipping {cv_file}: no text could be extracted")
        continue
    cv_embeddings[cv_file] = _embed_text(cv_text)

# List to store the top 5 CVs for each job description
top_cv_matches = []

# Score every CV against each job description
for job_description in job_descriptions:
    job_description = preprocess_text(job_description)
    job_description_embedding = _embed_text(job_description)

    # Map each CV file name to its similarity with this job description
    cv_similarities = {
        cv_file: calculate_similarity(cv_embedding, job_description_embedding)
        for cv_file, cv_embedding in cv_embeddings.items()
    }

    # Sort CVs by similarity score (highest first) and keep the top 5
    ranked = sorted(cv_similarities.items(), key=lambda item: item[1], reverse=True)
    top_cv_matches.append(
        {
            "job_description": job_description,
            "top_cv_matches": dict(ranked[:5]),
        }
    )

# Print the top 5 CVs for each job description
for i, job_matches in enumerate(top_cv_matches, start=1):
    print(f"Job Description {i}:\n{job_matches['job_description']}\n")
    print("Top 5 CV Matches:")
    for cv_file, similarity in job_matches['top_cv_matches'].items():
        print(f"CV: {cv_file}, Similarity Score: {similarity:.4f}")
    print("\n")