From a1cc1706ffab7d2527a49d4061b20ae792ba9cc3 Mon Sep 17 00:00:00 2001
From: Subramanyam Challa
<46422196+SubramanyamChalla24@users.noreply.github.com>
Date: Wed, 15 May 2024 01:56:27 -0600
Subject: [PATCH] removed cohere dependency , update codes . (#261)
---
CONTRIBUTING.md | 28 ----
README.md | 30 ----
requirements.txt | 1 -
scripts/similarity/get_score.ipynb | 226 +++++++++++++++++++++++++++++
4 files changed, 226 insertions(+), 59 deletions(-)
create mode 100644 scripts/similarity/get_score.ipynb
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index fa7b9e85..648956f5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -167,34 +167,6 @@ The full stack Next.js (React and FastAPI) web application allows users to inter
To run the full stack web application (frontend client and backend api servers), follow the instructions over on the [webapp README](/webapp/README.md) file.
-
-
-### Cohere and Qdrant
-
-1. Visit [Cohere website registration](https://dashboard.cohere.ai/welcome/register) and create an account.
-2. Go to API keys and copy your cohere api key.
-3. Visit [Qdrant website](https://cloud.qdrant.io/) and create an account.
-4. Get your api key and cluster url.
-5. Go to open dashboard in qdrant and enter your api key **for only the first time**
-
-
-
-6. Now create a yaml file named config.yml in Scripts/Similarity/ folder.
-
-7. The format for the conifg file should be as below:
- ```yaml
- cohere:
- api_key: cohere_key
- qdrant:
- api_key: qdrant_api_key
- url: qdrant_cluster_url
- ```
-8. Please replace your values without any quotes.
-
-*Note: Please make sure that Qdrant_client's version is higher than v1.1*
-
-
-
## Code Formatting
This project uses [Black](https://black.readthedocs.io/en/stable/) for code formatting. We believe this helps to keep the code base consistent and reduces the cognitive load when reading code.
diff --git a/README.md b/README.md
index 33eaebea..f55c70a6 100644
--- a/README.md
+++ b/README.md
@@ -198,35 +198,6 @@ To run the full stack web application (frontend client and backend api servers),
5. Copy the url and open it in your browser.
6. ![img_2.png](img_2.png)
-### Cohere and Qdrant
-
-1. Visit [Cohere website registration](https://dashboard.cohere.ai/welcome/register) and create an account.
-2. Go to API keys and copy your cohere api key.
-3. Visit [Qdrant website](https://cloud.qdrant.io/) and create an account.
-4. Get your api key and cluster url.
-5. Go to open dashboard in qdrant and enter your api key **for only the first time**
-
-1. Now create a yaml file named config.yml in Scripts/Similarity/ folder.
-2. The format for the conifg file should be as below:
- ```yaml
- cohere:
- api_key: cohere_key
- qdrant:
- api_key: qdrant_api_key
- url: qdrant_cluster_url
- ```
-3. Please replace your values without any quotes.
-
-*Note: Please make sure that Qdrant_client's version is higher than v1.1*
-
-*Note: This part needs updating w.r.t to the new FastEmbed changes.*
-
-
-
-
-
-
-
## Code Formatting
This project uses [Black](https://black.readthedocs.io/en/stable/) for code formatting. We believe this helps to keep the code base consistent and reduces the cognitive load when reading code.
@@ -253,7 +224,6 @@ Now, the pre-commit hooks will automatically run every time you commit your chan
## Join Us, Contribute!
-
Pull Requests & Issues are not just welcomed, they're celebrated! Let's create together.
diff --git a/requirements.txt b/requirements.txt
index 706f84b8..d2c7284b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -113,5 +113,4 @@ reportlab==3.6.13
easygui==0.98.3
xhtml2pdf==0.2.11
fastembed~=0.2.2
-cohere==5.3.2
qdrant-client==1.8.2
\ No newline at end of file
diff --git a/scripts/similarity/get_score.ipynb b/scripts/similarity/get_score.ipynb
new file mode 100644
index 00000000..6284bdfa
--- /dev/null
+++ b/scripts/similarity/get_score.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "id": "initial_id",
+ "metadata": {
+ "collapsed": true,
+ "ExecuteTime": {
+ "end_time": "2024-05-14T22:31:25.398358Z",
+ "start_time": "2024-05-14T22:31:22.567855Z"
+ }
+ },
+ "source": [
+ "import json\n",
+ "import logging\n",
+ "import os\n",
+ "\n",
+ "import yaml\n",
+ "from qdrant_client import QdrantClient\n",
+ "from scripts.utils.logger import init_logging_config\n"
+ ],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001B[32m2024-05-14 15:31:22.994\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mfastembed.embedding\u001B[0m:\u001B[36m\u001B[0m:\u001B[36m7\u001B[0m - \u001B[33m\u001B[1mDefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated.Use from fastembed import TextEmbedding instead.\u001B[0m\n"
+ ]
+ }
+ ],
+ "execution_count": 1
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-05-14T22:31:28.571615Z",
+ "start_time": "2024-05-14T22:31:28.569033Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "init_logging_config(basic_log_level=logging.INFO)\n",
+ "# Get the logger\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "# Set the logging level\n",
+ "logger.setLevel(logging.INFO)"
+ ],
+ "id": "a69d1b16785bf17b",
+ "outputs": [],
+ "execution_count": 2
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def find_path(folder_name):\n",
+ " curr_dir = os.getcwd()\n",
+ " while True:\n",
+ " if folder_name in os.listdir(curr_dir):\n",
+ " return os.path.join(curr_dir, folder_name)\n",
+ " else:\n",
+ " parent_dir = os.path.dirname(curr_dir)\n",
+ " if parent_dir == \"/\":\n",
+ " break\n",
+ " curr_dir = parent_dir\n",
+ " raise ValueError(f\"Folder '{folder_name}' not found.\")\n",
+ "\n",
+ "\n",
+ "def read_config(filepath):\n",
+ " try:\n",
+ " with open(filepath) as f:\n",
+ " config = yaml.safe_load(f)\n",
+ " return config\n",
+ " except FileNotFoundError as e:\n",
+ " logger.error(f\"Configuration file {filepath} not found: {e}\")\n",
+ " except yaml.YAMLError as e:\n",
+ " logger.error(\n",
+ " f\"Error parsing YAML in configuration file {filepath}: {e}\", exc_info=True\n",
+ " )\n",
+ " except Exception as e:\n",
+ " logger.error(f\"Error reading configuration file {filepath}: {e}\")\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def read_doc(path):\n",
+ " with open(path) as f:\n",
+ " try:\n",
+ " data = json.load(f)\n",
+ " except Exception as e:\n",
+ " logger.error(f\"Error reading JSON file: {e}\")\n",
+ " data = {}\n",
+ " return data\n"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-05-14T22:31:31.783421Z",
+ "start_time": "2024-05-14T22:31:31.763842Z"
+ }
+ },
+ "id": "899904f870d3f720",
+ "outputs": [],
+ "execution_count": 3
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def get_score(resume_string, job_description_string):\n",
+ " logger.info(\"Started getting similarity score\")\n",
+ "\n",
+ " documents = [resume_string]\n",
+ " client = QdrantClient(\":memory:\")\n",
+ " client.set_model(\"BAAI/bge-base-en\")\n",
+ "\n",
+ " client.add(\n",
+ " collection_name=\"demo_collection\",\n",
+ " documents=documents,\n",
+ " )\n",
+ "\n",
+ " search_result = client.query(\n",
+ " collection_name=\"demo_collection\", query_text=job_description_string\n",
+ " )\n",
+ " logger.info(\"Finished getting similarity score\")\n",
+ " return search_result"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-05-14T22:31:33.976354Z",
+ "start_time": "2024-05-14T22:31:33.972943Z"
+ }
+ },
+ "id": "c54a86727f436747",
+ "outputs": [],
+ "execution_count": 4
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "cwd = find_path('Resume-Matcher')\n",
+ "READ_RESUME_FROM = os.path.join(cwd, 'Data', 'Processed', 'Resumes')\n",
+ "READ_JOB_DESCRIPTION_FROM = os.path.join(cwd, 'Data', 'Processed', 'JobDescription')\n",
+ "config_path = os.path.join(cwd, \"scripts\", \"similarity\")"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-05-14T22:31:40.945947Z",
+ "start_time": "2024-05-14T22:31:40.934732Z"
+ }
+ },
+ "id": "ae15b96094e7c460",
+ "outputs": [],
+ "execution_count": 5
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# To give your custom resume use this code\n",
+ "resume_dict = read_config(\n",
+ " READ_RESUME_FROM\n",
+ " + \"/Resume-alfred_pennyworth_pm.pdf83632b66-5cce-4322-a3c6-895ff7e3dd96.json\"\n",
+ ")\n",
+ "job_dict = read_config(\n",
+ " READ_JOB_DESCRIPTION_FROM\n",
+ " + \"/JobDescription-job_desc_product_manager.pdf6763dc68-12ff-4b32-b652-ccee195de071.json\"\n",
+ ")\n",
+ "resume_keywords = resume_dict[\"extracted_keywords\"]\n",
+ "job_description_keywords = job_dict[\"extracted_keywords\"]\n",
+ "\n",
+ "resume_string = \" \".join(resume_keywords)\n",
+ "jd_string = \" \".join(job_description_keywords)\n",
+ "final_result = get_score(resume_string, jd_string)\n",
+ "for r in final_result:\n",
+ " print(r.score)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-05-14T22:32:29.666678Z",
+ "start_time": "2024-05-14T22:32:28.781841Z"
+ }
+ },
+ "id": "2f531db076cc1f59",
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001B[32;10m2024-05-14 15:32:28,825 (1835371807.py:2) - INFO: \u001B[0mStarted getting similarity score\n",
+ "\u001B[32;10m2024-05-14 15:32:29,664 (1835371807.py:16) - INFO: \u001B[0mFinished getting similarity score\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.8572231574198962\n"
+ ]
+ }
+ ],
+ "execution_count": 6
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}