-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
93 lines (71 loc) · 2.83 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# parser.py
import spacy
import re
from resume_handler import extract_text
# Load the small English spaCy pipeline once, at import time, so that every
# call to parse_resume() reuses the same pipeline object instead of reloading it.
# The "en_core_web_sm" model package must be installed separately
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")
# Skill names recognised by the parser, spelled in their canonical casing.
SKILLS_LIST = [
    "Python", "Java", "JavaScript",
    "C++", "C#",
    "React", "Node.js", "SQL",
    "Django", "Flask",
    "HTML", "CSS",
    "Ruby", "PHP", "Swift", "Kotlin",
    "AWS", "Docker",
]
def extract_email_and_phone(lines):
    """Return the first email address and first phone number found in *lines*.

    Args:
        lines: Iterable of text lines, scanned in order.

    Returns:
        A ``(email, phone)`` tuple. Each element is the matched substring
        taken from the earliest line that contains one, or ``None`` when no
        line matches.
    """
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    # Matches 3-3-4 digit groups such as "(555) 123-4567", "555.123.4567" or
    # "5551234567"; country codes and extensions are not part of the match.
    phone_pattern = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')

    email = None
    phone = None
    for line in lines:
        # Search each pattern at most once per line and reuse the match object.
        if email is None:
            match = email_pattern.search(line)
            if match:
                email = match.group()
        if phone is None:
            match = phone_pattern.search(line)
            if match:
                phone = match.group()
        # Nothing left to look for: skip the remaining lines.
        if email is not None and phone is not None:
            break
    return email, phone
def parse_resume(file_path):
    """Parse a resume file into a dictionary of structured fields.

    The text comes from ``extract_text(file_path)``. Skill and section-heading
    matching is case-insensitive, while every extracted value keeps the casing
    of the source text.

    Args:
        file_path: Path of the resume, passed unchanged to ``extract_text``.

    Returns:
        A dict with the keys:

        * ``"name"`` - first non-blank line of the text (``""`` if there is
          none).
        * ``"email"`` / ``"phone"`` - first match found by
          ``extract_email_and_phone``, or ``None``.
        * ``"skills"`` - entries of ``SKILLS_LIST`` that occur as tokens in
          the text, in order of first appearance, without duplicates.
        * ``"experience"`` / ``"education"`` / ``"certifications"`` - the
          non-blank lines that follow a heading line of the respective kind.
    """
    # Keep the text in its original casing; lowercasing it up front would
    # destroy the name and make the mixed-case SKILLS_LIST impossible to match.
    resume_text = extract_text(file_path)
    doc = nlp(resume_text)

    parsed_data = {
        "name": None,
        "email": None,
        "phone": None,
        "skills": [],
        "experience": [],
        "education": [],
        "certifications": []
    }

    lines = resume_text.split('\n')

    # Heuristic: the candidate's name is the first non-blank line.
    parsed_data["name"] = next(
        (line.strip() for line in lines if line.strip()), ""
    )

    parsed_data["email"], parsed_data["phone"] = extract_email_and_phone(lines)

    # Compare tokens case-insensitively, but report the canonical spelling
    # from SKILLS_LIST (e.g. "JavaScript", "SQL") and list each skill once.
    # NOTE(review): whether symbols like "C++" or "C#" survive as a single
    # token depends on the spaCy tokenizer - confirm with real resumes.
    canonical_skills = {skill.lower(): skill for skill in SKILLS_LIST}
    for token in doc:
        skill = canonical_skills.get(token.text.lower())
        if skill is not None and skill not in parsed_data["skills"]:
            parsed_data["skills"].append(skill)

    # Section headings are detected by substring match, tested in this order.
    # A line containing any keyword switches the current section and is not
    # stored itself. Because the match is on substrings, a content line such
    # as "built a web framework" also counts as a heading ("work").
    section_keywords = (
        ("experience", ("experience", "work", "employment")),
        ("education", ("education", "degree", "school", "university", "college")),
        ("certifications", ("certification", "certificate")),
    )
    current_section = None
    for line in lines:
        line_lower = line.lower()
        for section, keywords in section_keywords:
            if any(keyword in line_lower for keyword in keywords):
                current_section = section
                break
        else:
            # Not a heading: file the line under the section we are in, if any.
            content = line.strip()
            if current_section is not None and content:
                parsed_data[current_section].append(content)

    return parsed_data
# Example usage: python parser.py [path/to/resume]
if __name__ == "__main__":
    import sys

    # Use the path given on the command line; fall back to the sample resume.
    file_path = sys.argv[1] if len(sys.argv) > 1 else "resume-dataset/Resume2.pdf"
    parsed_resume = parse_resume(file_path)
    print(parsed_resume)