In [6]:
import numpy as np
from scipy.linalg import sqrtm
# given matrix sigma
sigma = np.array([[8, -2], [-2, 5]])
# compute the inverse of sigma
sigma_inv = np.linalg.inv(sigma)
sigma_inv
Out[6]:
array([[0.13888889, 0.05555556],
       [0.05555556, 0.22222222]])
In [7]:
sigma_inv_sqrt = sqrtm(sigma_inv)
print("Square root of the inverse of sigma:")
print(sigma_inv_sqrt)
Square root of the inverse of sigma:
[[0.36666667 0.06666667]
 [0.06666667 0.46666667]]
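As a quick sanity check (a minimal sketch, not part of the original run), squaring the result should reproduce sigma_inv:
In [ ]:
# the matrix square root times itself should recover sigma_inv (up to floating-point error)
np.allclose(sigma_inv_sqrt @ sigma_inv_sqrt, sigma_inv)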
In [1]:
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import re
import os
import glob
import shutil
import openpyxl
from openpyxl import Workbook
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from pathlib import Path
from datetime import datetime, timedelta
os.getcwd()
pd.options.display.max_rows = 1000
In [2]:
# create the input directory (input_dir)
source_path = os.path.dirname(os.path.abspath('__file__'))
# source_path = '/scratch/bell/sido/m&a'
INPUT_DIR = os.path.join(source_path, 'data')
# create INPUT_DIR if it does not exist yet
if not os.path.isdir(INPUT_DIR):
    os.mkdir(INPUT_DIR)
# create the output directory (output_dir)
OUTPUT_DIR = os.path.join(source_path, 'outputs')
# create OUTPUT_DIR if it does not exist yet
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)
In [4]:
# Once you run this code, comment it out
# move csv and xlsx files to the `data` directory (folder)
# unique_dir_names = []
# for f in Path(f'{source_path}').rglob('*.csv'):
#     unique_dir_names.append(f)
# for g in Path(f'{source_path}').rglob('*.xlsx'):
#     unique_dir_names.append(g)
# for file in list(set(unique_dir_names)):
#     print(f'moved file: {file}')
#     shutil.move(f'{file}', f'{INPUT_DIR}')
moved file: /Users/satoshiido/Documents/programming/RA/pharma/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/~$temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/USPTOtoCRSP_Pharma_Dictionary.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/USPTO_firm_list.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/same_orgname_dif_assigneeid_pharma.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/duplicated_cleaned_pharma.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/pharma-not-in-ma-list-20230416.csv
In [3]:
# pandas helper to read csv files from INPUT_DIR without having to specify the directory
def read_csv(name, **kwargs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pd.read_csv(path, **kwargs)
# polars helper to read csv files from INPUT_DIR without having to specify the directory
def read_csvpl(name, **kwargs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pl.read_csv(path, **kwargs)
DATA IMPORT
In [15]:
#convert xlsx to csv
# firm_dir = pd.DataFrame(pd.read_excel(os.path.join(INPUT_DIR, "USPTO_firm_list.xlsx")))
# firm_dir.to_csv(os.path.join(INPUT_DIR, "uspto_firm_list.csv"), encoding='utf-8', index=False)
# temproster = pd.DataFrame(pd.read_excel(os.path.join(INPUT_DIR, "temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx")))
# temproster.to_csv(os.path.join(INPUT_DIR, "temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv"), encoding='utf-8', index=False)
In [4]:
firm_dir = read_csv('uspto_firm_list')
temproster = read_csv('temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist')
Load: /Users/satoshiido/Documents/programming/RA/pharma/data/uspto_firm_list.csv
Load: /Users/satoshiido/Documents/programming/RA/pharma/data/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv
PREPROCESS
In [5]:
# Check for non-string or missing values in the 'organization' column
firm_dir[firm_dir['organization'].apply(lambda x: not isinstance(x, str))]
Out[5]:
 | assignee_id | organization
---|---|---
106094 | 4852ee47-e206-4b18-8fbd-3027c7020c3c | NaN |
279157 | be8d184f-ad1e-4cce-ba25-9588bf36c054 | NaN |
326732 | ded97c9f-f562-49f6-bbfa-7591b6c5d6d1 | NaN |
In [6]:
# fill NaN values with an empty string
firm_dir['organization'] = firm_dir['organization'].fillna("")
In [7]:
# regular expression to remove common corporate suffixes from a company name
def clean_company_name(name):
    # the regex matches a non-capturing group of suffixes (Inc, Ltd, GmbH, Corp, ...) and
    # removes everything from the matched suffix onward, so cases like "Co. Ltd." are handled
    postfix_pattern = re.compile(
        r'(,?\b\s*(Inc(?=\W|$)\.?|Ltd(?=\W|$)\.?|LLC|LTD|GmbH|\& Co\.?\s*|\bCo\b\s*|\bCo\.\s*|Corp|Corporation|S\.A\.|S\.P\.A\.|S\.A\.S\.|S\.R\.L\.|LLP|LP|'
        r'S\.L\.|Oyj|Zrt|Pty|Kg|Kgaa|N\.V\.|HF|BVBA|B\.V\.|S\.E\.|PLC|PBC|S\.C\.A\.|S\.E\.M\.C\.O\.|S\.E\.C\.|'
        r'Limited).*)', re.IGNORECASE)
    # substitute the matched suffix (and everything after it) with an empty string, then lowercase
    cleaned_name = re.sub(postfix_pattern, "", name).strip()
    cleaned_name = cleaned_name.lower()
    return cleaned_name
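For reference, a quick spot-check of the cleaner on two names that appear later in the data (a minimal sketch, not part of the original run):
In [ ]:
# spot-check the suffix removal and lowercasing
print(clean_company_name("ALZA Corp"))                    # 'alza'
print(clean_company_name("AEG Elektrofotografie GmbH"))   # 'aeg elektrofotografie'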
In [8]:
# apply the updated function to the dataframe
firm_dir["cleaned_organization"] = firm_dir["organization"].apply(clean_company_name)
temproster["cleaned_TargetName"] = temproster["TargetName"].apply(clean_company_name)
In [9]:
temproster.head(20)
Out[9]:
 | TargetName | cleaned_TargetName
---|---|---
0 | 3D Systems Corp | 3d systems |
1 | AB SCIEX | ab sciex |
2 | ACT Gene SA | act gene sa |
3 | ADAC Laboratories | adac laboratories |
4 | AECI Ltd | aeci |
5 | AEG Elektrofotografie GmbH | aeg elektrofotografie |
6 | AFx Inc | afx |
7 | ALIGN Pharmaceuticals LLC | align pharmaceuticals |
8 | ALZA Corp | alza |
9 | AMAG Pharmaceuticals Inc | amag pharmaceuticals |
10 | AMICAS Inc | amicas |
11 | AMRESCO Inc | amresco |
12 | ANC LLC | anc |
13 | APEIRON Biologics AG | apeiron biologics ag |
14 | APEX Intl Clinical Research Co | apex intl clinical research |
15 | ARIAD Gene Therapeutics Inc | ariad gene therapeutics |
16 | ATMI Inc | atmi |
17 | AbVitro Inc | abvitro |
18 | Abaxis Inc | abaxis |
19 | AbbVie Inc | abbvie |
In [196]:
temproster.to_csv(os.path.join(INPUT_DIR, "temproster.csv"), encoding='utf-8', index=False)
firm_dir.to_csv(os.path.join(INPUT_DIR, "firm_dir.csv"), encoding='utf-8', index=False)
MATCHING
Fuzzy matching
In [16]:
# rename the cleaned name column to 'name' for merging
temproster2 = temproster.rename(columns={"cleaned_TargetName": "name"})
In [17]:
temproster2
Out[17]:
 | TargetName | name
---|---|---
0 | 3D Systems Corp | 3d systems |
1 | AB SCIEX | ab sciex |
2 | ACT Gene SA | act gene sa |
3 | ADAC Laboratories | adac laboratories |
4 | AECI Ltd | aeci |
... | ... | ... |
1473 | iviGene Corp | ivigene |
1474 | microcuff GmbH | microcuff |
1475 | nanoTechnology Systems Pty Ltd | nanotechnology systems |
1476 | picoSpin LLC | picospin |
1477 | superDimension Ltd | superdimension |
1478 rows × 2 columns
In [ ]:
# define a function to get the best match from firm_dir's cleaned company names for each company name in temproster
def get_fuzzy_match(row, choices, scorer, threshold):
    # extractOne returns the single best (match, score) pair
    all_matches = process.extractOne(row["name"], choices=choices, scorer=scorer)
    print(all_matches)
    # Filter matches based on the threshold
    # valid_matches = [match for match, score in all_matches if score >= threshold]
    return all_matches
# print the best match for each company name in temproster
matches = []
for i, row in temproster2.iterrows():
    get_fuzzy_match(row, firm_dir["cleaned_organization"].tolist(), scorer=fuzz.WRatio, threshold=98)
In [ ]:
# define a function to get the best match above a score threshold from firm_dir's cleaned company names for each company name in temproster
def get_fuzzy_match(row, choices, scorer, threshold):
    # extractOne returns a single (match, score) tuple, or None when there are no choices
    best_match = process.extractOne(row["name"], choices=choices, scorer=scorer)
    if best_match is None:
        return []
    match, score = best_match
    # keep the match only if its score meets the threshold
    return [match] if score >= threshold else []
# find all matches for each company name in temproster
matches = []
for i, row in temproster2.iterrows():
    match_list = get_fuzzy_match(row, firm_dir["cleaned_organization"].tolist(), scorer=fuzz.WRatio, threshold=98)
    for match in match_list:
        matches.append({"temproster_TargetName": row["name"], "firm_dir_organization": match, "firm_dir_assignee_id": firm_dir[firm_dir["cleaned_organization"] == match]["assignee_id"].values[0]})
# convert the list of match dictionaries to a DataFrame
matches_df = pd.DataFrame(matches)
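An equivalent, slightly shorter variant (a sketch, not part of the original run; the helper name is hypothetical) lets fuzzywuzzy apply the threshold itself via extractOne's score_cutoff argument:
In [ ]:
# hypothetical alternative: score_cutoff makes extractOne return None when no match reaches the threshold
def get_fuzzy_match_cutoff(row, choices, scorer, threshold):
    best_match = process.extractOne(row["name"], choices=choices, scorer=scorer, score_cutoff=threshold)
    return [] if best_match is None else [best_match[0]]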
Exact matching
In [18]:
# simple merge
merged_df = pd.merge(temproster2, firm_dir, left_on="name", right_on="cleaned_organization", how="left")
In [19]:
# rename 'name' back to 'cleaned_TargetName', drop rows where the merge found no match in firm_dir, and keep the relevant columns
merged_df2 = merged_df.rename(columns={"name": "cleaned_TargetName"}).dropna(subset=["cleaned_organization"])[["TargetName", "cleaned_TargetName", "organization", "cleaned_organization", "assignee_id"]]
merged_df2.head(20)
Out[19]:
 | TargetName | cleaned_TargetName | organization | cleaned_organization | assignee_id
---|---|---|---|---|---
1 | AB SCIEX | ab sciex | AB Sciex LLC | ab sciex | 52baa10b-d9bd-4fc7-88ad-3e25d4de3141 |
4 | AECI Ltd | aeci | AECI Limited | aeci | f8bec60a-cb40-4009-837b-f5d557b1e434 |
5 | AEG Elektrofotografie GmbH | aeg elektrofotografie | AEG Elektrofotografie GmbH | aeg elektrofotografie | e2d3a3be-7f93-4e6a-8921-04992b68e048 |
6 | AFx Inc | afx | AFX Inc. | afx | b57b3456-5dc2-4f40-8be0-089cdbd2b027 |
8 | ALZA Corp | alza | ALZA CORPORATION | alza | aad595fd-89b4-4aa0-8992-0d36cadf5ac0 |
11 | AMRESCO Inc | amresco | Amresco Inc. | amresco | 89025abc-6c63-4c79-9fc3-64785dfba78f |
13 | APEIRON Biologics AG | apeiron biologics ag | APEIRON BIOLOGICS AG | apeiron biologics ag | 41af2c4a-5a82-44fa-83fb-2997036b4bca |
16 | ATMI Inc | atmi | ATMI BVBA | atmi | 470c6c47-1290-464e-b289-449b3e9defa4 |
17 | AbVitro Inc | abvitro | AbVitro LLC | abvitro | 1e916abd-b65b-486d-a72d-000b8023748e |
19 | AbbVie Inc | abbvie | AbbVie Inc. | abbvie | e047aabc-b6f3-4b91-899d-79bba32ba295 |
26 | Acceleron Pharma Inc | acceleron pharma | ACCELERON PHARMA INC. | acceleron pharma | 95a8c21b-8256-4b78-9df7-71590971eff5 |
30 | Aceto Corp | aceto | Aceto Corporation | aceto | 4236ec55-1cea-4e33-b6e7-9ee850ba117b |
36 | Acuson Corp | acuson | ACUSON | acuson | e8dcf764-426b-4921-a526-b189a2d335ef |
53 | Aegis Analytical Corp | aegis analytical | AEGIS ANALYTICAL CORPORATION | aegis analytical | 145729fe-c0af-4493-9ff6-13d29777dcdd |
56 | AesRx LLC | aesrx | AesRx LLC | aesrx | ec642047-4c94-490f-871b-02c9d2a150c7 |
62 | Agencourt Bioscience Corp | agencourt bioscience | Agencourt Bioscience Corporation | agencourt bioscience | d4c083f5-faba-4c5a-9778-977ee9b14453 |
66 | Agro Dynamics Corp | agro dynamics | Agro Dynamics | agro dynamics | 3be10411-2968-4cda-903b-d1cc5163b1a5 |
68 | Ahura Scientific Inc | ahura scientific | Ahura Scientific Inc. | ahura scientific | 5cec5486-f446-49a4-9165-d5c29da66727 |
78 | Alanex Corp | alanex | Alanex Corporation | alanex | 07db9c5d-8926-46be-a5ec-70ab3658b6c2 |
80 | Albemarle Corp | albemarle | ALBEMARLE CORPORATION | albemarle | 9f083918-5ae1-4692-80a8-6023c18aca79 |
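To see which target names found no exact counterpart and still need fuzzy or manual matching, one option (a sketch, not part of the original run) is to look at the rows the left merge left unmatched:
In [ ]:
# hypothetical check: target names whose exact merge found no counterpart in firm_dir
unmatched = merged_df[merged_df["cleaned_organization"].isna()][["TargetName", "name"]]
unmatched.head(20)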
In [204]:
merged_df2.to_csv(os.path.join(OUTPUT_DIR, "merged_df.csv"), encoding='utf-8', index=False)
merged_df2.to_excel(os.path.join(OUTPUT_DIR, "merged_df.xlsx"), index=False)