In [6]:
import numpy as np
from scipy.linalg import sqrtm

# Given matrix sigma inverse
sigma = np.array([[8, -2], [-2, 5]])

# Calculate the square root of the inverse of sigma
sigma_inv = np.linalg.inv(sigma)
sigma_inv
Out[6]:
array([[0.13888889, 0.05555556],
       [0.05555556, 0.22222222]])
In [7]:
sigma_inv_sqrt = sqrtm(sigma_inv)

print("Square root of the inverse of sigma:")
print(sigma_inv_sqrt)
Square root of the inverse of sigma:
[[0.36666667 0.06666667]
 [0.06666667 0.46666667]]
In [1]:
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import re
import os
import glob
import shutil
import openpyxl
from openpyxl import Workbook
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from pathlib import Path
from datetime import datetime, timedelta
os.getcwd()
pd.options.display.max_rows = 1000
In [2]:
# create the input_dir(input directory)
source_path = os.path.dirname(os.path.abspath('__file__'))
# source_path = '/scratch/bell/sido/m&a'
INPUT_DIR = os.path.join(source_path, 'data')

# if INPUT_DIR has not been created yet, create it
if not os.path.isdir(INPUT_DIR):
    os.mkdir(INPUT_DIR)

# output_dir(output directory) creation
OUTPUT_DIR = os.path.join(source_path, 'outputs')

# if OUTPUT_DIR has not been created yet, create it
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)
In [4]:
# Once you run this code, comment it out
# move csv files to `data` directory(=folder)

# unique_dir_names = []
# for f in Path(f'{source_path}').rglob('*.csv'):
#     unique_dir_names.append(f)
# for g in Path(f'{source_path}').rglob('*.xlsx'):
#     unique_dir_names.append(g)

# for file in list(set(unique_dir_names)):
#     print(f'moved file: {file}')
#     shutil.move(f'{file}', f'{INPUT_DIR}')
moved file: /Users/satoshiido/Documents/programming/RA/pharma/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/~$temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/USPTOtoCRSP_Pharma_Dictionary.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/USPTO_firm_list.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/same_orgname_dif_assigneeid_pharma.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/duplicated_cleaned_pharma.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/pharma-not-in-ma-list-20230416.csv
In [3]:
# Pandas function to let us read csv files without having to specify the directory
def read_csv(name, **kwrgs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pd.read_csv(path, **kwrgs)

# Polars function to let us read csv files wi`thout having to specify the directory
def read_csvpl(name, **kwrgs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pl.read_csv(path, **kwrgs)

DATA IMPORT¶

In [15]:
#convert xlsx to csv
# firm_dir = pd.DataFrame(pd.read_excel(os.path.join(INPUT_DIR, "USPTO_firm_list.xlsx")))
# firm_dir.to_csv(os.path.join(INPUT_DIR, "uspto_firm_list.csv"), encoding='utf-8', index=False)
# temproster = pd.DataFrame(pd.read_excel(os.path.join(INPUT_DIR, "temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx")))
# temproster.to_csv(os.path.join(INPUT_DIR, "temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv"), encoding='utf-8', index=False)
In [4]:
firm_dir = read_csv('uspto_firm_list')
temproster = read_csv('temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist')
Load: /Users/satoshiido/Documents/programming/RA/pharma/data/uspto_firm_list.csv
Load: /Users/satoshiido/Documents/programming/RA/pharma/data/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv

PREPROCESS¶

In [5]:
# Check for non-string or missing values in the 'organization' column
firm_dir[firm_dir['organization'].apply(lambda x: not isinstance(x, str))]
Out[5]:
assignee_id organization
106094 4852ee47-e206-4b18-8fbd-3027c7020c3c NaN
279157 be8d184f-ad1e-4cce-ba25-9588bf36c054 NaN
326732 ded97c9f-f562-49f6-bbfa-7591b6c5d6d1 NaN
In [6]:
# fill NaN values with an empty string
firm_dir['organization'].fillna("", inplace=True)
In [7]:
# regular expression to remove the following words from the company name
def clean_company_name(name):
    # extend the regular expression to include 'A/S' and use a non-capturing group for postfixes
    # the regex will handle cases like "Co. Ltd." to remove following words    
    postfix_pattern = re.compile(
        r'(,?\b\s*(Inc(?=\W|$)\.?|Ltd(?=\W|$)\.?|LLC|LTD|GmbH|\& Co\.?\s*|\bCo\b\s*|\bCo\.\s*|Corp|Corporation|S\.A\.|S\.P\.A\.|S\.A\.S\.|S\.R\.L\.|LLP|LP|'
        r'S\.L\.|Oyj|Zrt|Pty|Kg|Kgaa|N\.V\.|HF|BVBA|B\.V\.|S\.E\.|PLC|PBC|S\.C\.A\.|S\.E\.M\.C\.O\.|S\.E\.C\.|'
        r'Limited).*)', re.IGNORECASE)

    # substitute found patterns with an empty string
    cleaned_name = re.sub(postfix_pattern, "", name).strip()
    cleaned_name = cleaned_name.lower()
    
    return cleaned_name
In [8]:
# apply the updated function to the dataframe
firm_dir["cleaned_organization"] = firm_dir["organization"].apply(clean_company_name)
temproster["cleaned_TargetName"] = temproster["TargetName"].apply(clean_company_name)
In [9]:
temproster.head(20)
Out[9]:
TargetName cleaned_TargetName
0 3D Systems Corp 3d systems
1 AB SCIEX ab sciex
2 ACT Gene SA act gene sa
3 ADAC Laboratories adac laboratories
4 AECI Ltd aeci
5 AEG Elektrofotografie GmbH aeg elektrofotografie
6 AFx Inc afx
7 ALIGN Pharmaceuticals LLC align pharmaceuticals
8 ALZA Corp alza
9 AMAG Pharmaceuticals Inc amag pharmaceuticals
10 AMICAS Inc amicas
11 AMRESCO Inc amresco
12 ANC LLC anc
13 APEIRON Biologics AG apeiron biologics ag
14 APEX Intl Clinical Research Co apex intl clinical research
15 ARIAD Gene Therapeutics Inc ariad gene therapeutics
16 ATMI Inc atmi
17 AbVitro Inc abvitro
18 Abaxis Inc abaxis
19 AbbVie Inc abbvie
In [196]:
temproster.to_csv(os.path.join(INPUT_DIR, "temproster.csv"), encoding='utf-8', index=False)
firm_dir.to_csv(os.path.join(INPUT_DIR, "firm_dir.csv"), encoding='utf-8', index=False)

MATCHING¶

Fuzzy matching¶

In [16]:
# rename the new column to 'name'
temproster2 = temproster.rename(columns={"cleaned_TargetName": "name"})
In [17]:
temproster2
Out[17]:
TargetName name
0 3D Systems Corp 3d systems
1 AB SCIEX ab sciex
2 ACT Gene SA act gene sa
3 ADAC Laboratories adac laboratories
4 AECI Ltd aeci
... ... ...
1473 iviGene Corp ivigene
1474 microcuff GmbH microcuff
1475 nanoTechnology Systems Pty Ltd nanotechnology systems
1476 picoSpin LLC picospin
1477 superDimension Ltd superdimension

1478 rows × 2 columns

In [ ]:
# define a function to get the best match from the list of company names in company_dir for companies names in temproster
def get_fuzzy_match(row, choices, scorer, threshold):
    all_matches = process.extractOne(row["name"], choices=choices, scorer=scorer)
    print(all_matches)
    # Filter matches based on the threshold
    # valid_matches = [match for match, score in all_matches if score >= threshold]
    return print(all_matches)

# find all matches for each company name in temproster
matches = []
for i, row in temproster2.iterrows():
    get_fuzzy_match(row, firm_dir["cleaned_organization"].tolist(), scorer=fuzz.WRatio, threshold=98)
In [ ]:
# define a function to get the best match from the list of company names in company_dir for companies names in temproster
def get_fuzzy_match(row, choices, scorer, threshold):
    all_matches = process.extractOne(row["name"], choices=choices, scorer=scorer)
    # Filter matches based on the threshold
    valid_matches = [match for match, score in all_matches if score >= threshold]
    return valid_matches

# find all matches for each company name in temproster
matches = []
for i, row in temproster2.iterrows():
    match_list = get_fuzzy_match(row, firm_dir["cleaned_organization"].tolist(), scorer=fuzz.WRatio, threshold=98)
    for match in match_list: 
        matches.append({"temproster_TargetName": row["name"], "firm_dir_organization": match, "firm_dir_assignee_id": firm_dir[firm_dir["cleaned_organization"] == match]["assignee_id"].values[0]})

# convert the list of match dictionaries to a DataFrame
matches_df = pd.DataFrame(matches)

Exact matching¶

In [18]:
# simple merge
merged_df = pd.merge(temproster2, firm_dir, left_on="name", right_on="cleaned_organization", how="left")
In [19]:
# rename the new column to 'name' and drop the data from firm_dir that was merged if it was not a match
merged_df2 = merged_df.rename(columns={"name": "cleaned_TargetName"}).dropna(subset=["cleaned_organization"])[["TargetName", "cleaned_TargetName", "organization", "cleaned_organization", "assignee_id"]]
merged_df2.head(20)
Out[19]:
TargetName cleaned_TargetName organization cleaned_organization assignee_id
1 AB SCIEX ab sciex AB Sciex LLC ab sciex 52baa10b-d9bd-4fc7-88ad-3e25d4de3141
4 AECI Ltd aeci AECI Limited aeci f8bec60a-cb40-4009-837b-f5d557b1e434
5 AEG Elektrofotografie GmbH aeg elektrofotografie AEG Elektrofotografie GmbH aeg elektrofotografie e2d3a3be-7f93-4e6a-8921-04992b68e048
6 AFx Inc afx AFX Inc. afx b57b3456-5dc2-4f40-8be0-089cdbd2b027
8 ALZA Corp alza ALZA CORPORATION alza aad595fd-89b4-4aa0-8992-0d36cadf5ac0
11 AMRESCO Inc amresco Amresco Inc. amresco 89025abc-6c63-4c79-9fc3-64785dfba78f
13 APEIRON Biologics AG apeiron biologics ag APEIRON BIOLOGICS AG apeiron biologics ag 41af2c4a-5a82-44fa-83fb-2997036b4bca
16 ATMI Inc atmi ATMI BVBA atmi 470c6c47-1290-464e-b289-449b3e9defa4
17 AbVitro Inc abvitro AbVitro LLC abvitro 1e916abd-b65b-486d-a72d-000b8023748e
19 AbbVie Inc abbvie AbbVie Inc. abbvie e047aabc-b6f3-4b91-899d-79bba32ba295
26 Acceleron Pharma Inc acceleron pharma ACCELERON PHARMA INC. acceleron pharma 95a8c21b-8256-4b78-9df7-71590971eff5
30 Aceto Corp aceto Aceto Corporation aceto 4236ec55-1cea-4e33-b6e7-9ee850ba117b
36 Acuson Corp acuson ACUSON acuson e8dcf764-426b-4921-a526-b189a2d335ef
53 Aegis Analytical Corp aegis analytical AEGIS ANALYTICAL CORPORATION aegis analytical 145729fe-c0af-4493-9ff6-13d29777dcdd
56 AesRx LLC aesrx AesRx LLC aesrx ec642047-4c94-490f-871b-02c9d2a150c7
62 Agencourt Bioscience Corp agencourt bioscience Agencourt Bioscience Corporation agencourt bioscience d4c083f5-faba-4c5a-9778-977ee9b14453
66 Agro Dynamics Corp agro dynamics Agro Dynamics agro dynamics 3be10411-2968-4cda-903b-d1cc5163b1a5
68 Ahura Scientific Inc ahura scientific Ahura Scientific Inc. ahura scientific 5cec5486-f446-49a4-9165-d5c29da66727
78 Alanex Corp alanex Alanex Corporation alanex 07db9c5d-8926-46be-a5ec-70ab3658b6c2
80 Albemarle Corp albemarle ALBEMARLE CORPORATION albemarle 9f083918-5ae1-4692-80a8-6023c18aca79
In [204]:
merged_df2.to_csv(os.path.join(OUTPUT_DIR, "merged_df.csv"), encoding='utf-8', index=False)
merged_df2.to_excel(os.path.join(OUTPUT_DIR, "merged_df.xlsx"), index=False)
In [ ]: