In [6]:
import numpy as np
from scipy.linalg import sqrtm
# given matrix sigma
sigma = np.array([[8, -2], [-2, 5]])
# compute the inverse of sigma
sigma_inv = np.linalg.inv(sigma)
sigma_inv
Out[6]:
array([[0.13888889, 0.05555556],
       [0.05555556, 0.22222222]])
In [7]:
sigma_inv_sqrt = sqrtm(sigma_inv)
print("Square root of the inverse of sigma:")
print(sigma_inv_sqrt)
Square root of the inverse of sigma:
[[0.36666667 0.06666667]
 [0.06666667 0.46666667]]
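As a quick sanity check (a minimal sketch, not part of the original run), squaring the result should reproduce sigma_inv:
In [ ]:
# the matrix square root times itself should recover sigma_inv (up to floating-point error)
np.allclose(sigma_inv_sqrt @ sigma_inv_sqrt, sigma_inv)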
In [1]:
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import re
import os
import glob
import shutil
import openpyxl
from openpyxl import Workbook
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from pathlib import Path
from datetime import datetime, timedelta
os.getcwd()
pd.options.display.max_rows = 1000
In [2]:
# create the input directory (input_dir)
source_path = os.path.dirname(os.path.abspath('__file__'))
# source_path = '/scratch/bell/sido/m&a'
INPUT_DIR = os.path.join(source_path, 'data')
# create INPUT_DIR if it does not exist yet
if not os.path.isdir(INPUT_DIR):
    os.mkdir(INPUT_DIR)
# create the output directory (output_dir)
OUTPUT_DIR = os.path.join(source_path, 'outputs')
# create OUTPUT_DIR if it does not exist yet
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)
In [4]:
# Once you run this code, comment it out
# move csv and xlsx files to the `data` directory (folder)
# unique_dir_names = []
# for f in Path(f'{source_path}').rglob('*.csv'):
#     unique_dir_names.append(f)
# for g in Path(f'{source_path}').rglob('*.xlsx'):
#     unique_dir_names.append(g)
# for file in list(set(unique_dir_names)):
#     print(f'moved file: {file}')
#     shutil.move(f'{file}', f'{INPUT_DIR}')
moved file: /Users/satoshiido/Documents/programming/RA/pharma/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/~$temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/USPTOtoCRSP_Pharma_Dictionary.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/USPTO_firm_list.xlsx
moved file: /Users/satoshiido/Documents/programming/RA/pharma/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/same_orgname_dif_assigneeid_pharma.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/duplicated_cleaned_pharma.csv
moved file: /Users/satoshiido/Documents/programming/RA/pharma/name_matching/pharma-not-in-ma-list-20230416.csv
In [3]:
# pandas helper to read csv files from INPUT_DIR without having to specify the directory
def read_csv(name, **kwargs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pd.read_csv(path, **kwargs)
# polars helper to read csv files from INPUT_DIR without having to specify the directory
def read_csvpl(name, **kwargs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pl.read_csv(path, **kwargs)
DATA IMPORT
In [15]:
#convert xlsx to csv
# firm_dir = pd.DataFrame(pd.read_excel(os.path.join(INPUT_DIR, "USPTO_firm_list.xlsx")))
# firm_dir.to_csv(os.path.join(INPUT_DIR, "uspto_firm_list.csv"), encoding='utf-8', index=False)
# temproster = pd.DataFrame(pd.read_excel(os.path.join(INPUT_DIR, "temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.xlsx")))
# temproster.to_csv(os.path.join(INPUT_DIR, "temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv"), encoding='utf-8', index=False)
In [4]:
firm_dir = read_csv('uspto_firm_list')
temproster = read_csv('temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist')
Load: /Users/satoshiido/Documents/programming/RA/pharma/data/uspto_firm_list.csv
Load: /Users/satoshiido/Documents/programming/RA/pharma/data/temproster2_allversion_targetTOmissingand_recapdatanotmissingOct2023_Firmlist.csv
PREPROCESS
In [5]:
# Check for non-string or missing values in the 'organization' column
firm_dir[firm_dir['organization'].apply(lambda x: not isinstance(x, str))]
Out[5]:
 | assignee_id | organization
---|---|---
106094 | 4852ee47-e206-4b18-8fbd-3027c7020c3c | NaN |
279157 | be8d184f-ad1e-4cce-ba25-9588bf36c054 | NaN |
326732 | ded97c9f-f562-49f6-bbfa-7591b6c5d6d1 | NaN |
In [6]:
# fill NaN values with an empty string
firm_dir['organization'] = firm_dir['organization'].fillna("")
In [7]:
# regular expression to remove common corporate suffixes from a company name
def clean_company_name(name):
    # the regex matches a non-capturing group of suffixes (Inc, Ltd, GmbH, Corp, ...) and
    # removes everything from the matched suffix onward, so cases like "Co. Ltd." are handled
    postfix_pattern = re.compile(
        r'(,?\b\s*(Inc(?=\W|$)\.?|Ltd(?=\W|$)\.?|LLC|LTD|GmbH|\& Co\.?\s*|\bCo\b\s*|\bCo\.\s*|Corp|Corporation|S\.A\.|S\.P\.A\.|S\.A\.S\.|S\.R\.L\.|LLP|LP|'
        r'S\.L\.|Oyj|Zrt|Pty|Kg|Kgaa|N\.V\.|HF|BVBA|B\.V\.|S\.E\.|PLC|PBC|S\.C\.A\.|S\.E\.M\.C\.O\.|S\.E\.C\.|'
        r'Limited).*)', re.IGNORECASE)
    # substitute the matched suffix (and everything after it) with an empty string, then lowercase
    cleaned_name = re.sub(postfix_pattern, "", name).strip()
    cleaned_name = cleaned_name.lower()
    return cleaned_name
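For reference, a quick spot-check of the cleaner on two names that appear later in the data (a minimal sketch, not part of the original run):
In [ ]:
# spot-check the suffix removal and lowercasing
print(clean_company_name("ALZA Corp"))                    # 'alza'
print(clean_company_name("AEG Elektrofotografie GmbH"))   # 'aeg elektrofotografie'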
In [8]:
# apply the updated function to the dataframe
firm_dir["cleaned_organization"] = firm_dir["organization"].apply(clean_company_name)
temproster["cleaned_TargetName"] = temproster["TargetName"].apply(clean_company_name)
In [9]:
temproster.head(20)
Out[9]:
 | TargetName | cleaned_TargetName
---|---|---
0 | 3D Systems Corp | 3d systems |
1 | AB SCIEX | ab sciex |
2 | ACT Gene SA | act gene sa |
3 | ADAC Laboratories | adac laboratories |
4 | AECI Ltd | aeci |
5 | AEG Elektrofotografie GmbH | aeg elektrofotografie |
6 | AFx Inc | afx |
7 | ALIGN Pharmaceuticals LLC | align pharmaceuticals |
8 | ALZA Corp | alza |
9 | AMAG Pharmaceuticals Inc | amag pharmaceuticals |
10 | AMICAS Inc | amicas |
11 | AMRESCO Inc | amresco |
12 | ANC LLC | anc |
13 | APEIRON Biologics AG | apeiron biologics ag |
14 | APEX Intl Clinical Research Co | apex intl clinical research |
15 | ARIAD Gene Therapeutics Inc | ariad gene therapeutics |
16 | ATMI Inc | atmi |
17 | AbVitro Inc | abvitro |
18 | Abaxis Inc | abaxis |
19 | AbbVie Inc | abbvie |
In [196]:
temproster.to_csv(os.path.join(INPUT_DIR, "temproster.csv"), encoding='utf-8', index=False)
firm_dir.to_csv(os.path.join(INPUT_DIR, "firm_dir.csv"), encoding='utf-8', index=False)
MATCHING
Fuzzy matching
In [16]:
# rename the cleaned name column to 'name' for merging
temproster2 = temproster.rename(columns={"cleaned_TargetName": "name"})
In [17]:
temproster2
Out[17]:
 | TargetName | name
---|---|---
0 | 3D Systems Corp | 3d systems |
1 | AB SCIEX | ab sciex |
2 | ACT Gene SA | act gene sa |
3 | ADAC Laboratories | adac laboratories |
4 | AECI Ltd | aeci |
... | ... | ... |
1473 | iviGene Corp | ivigene |
1474 | microcuff GmbH | microcuff |
1475 | nanoTechnology Systems Pty Ltd | nanotechnology systems |
1476 | picoSpin LLC | picospin |
1477 | superDimension Ltd | superdimension |
1478 rows × 2 columns
In [ ]:
# define a function to get the best match from firm_dir's cleaned company names for each company name in temproster
def get_fuzzy_match(row, choices, scorer, threshold):
    # extractOne returns the single best (match, score) pair
    all_matches = process.extractOne(row["name"], choices=choices, scorer=scorer)
    print(all_matches)
    # Filter matches based on the threshold
    # valid_matches = [match for match, score in all_matches if score >= threshold]
    return all_matches
# print the best match for each company name in temproster
matches = []
for i, row in temproster2.iterrows():
    get_fuzzy_match(row, firm_dir["cleaned_organization"].tolist(), scorer=fuzz.WRatio, threshold=98)
In [ ]:
# define a function to get the best match above a score threshold from firm_dir's cleaned company names for each company name in temproster
def get_fuzzy_match(row, choices, scorer, threshold):
    # extractOne returns a single (match, score) tuple, or None when there are no choices
    best_match = process.extractOne(row["name"], choices=choices, scorer=scorer)
    if best_match is None:
        return []
    match, score = best_match
    # keep the match only if its score meets the threshold
    return [match] if score >= threshold else []
# find all matches for each company name in temproster
matches = []
for i, row in temproster2.iterrows():
    match_list = get_fuzzy_match(row, firm_dir["cleaned_organization"].tolist(), scorer=fuzz.WRatio, threshold=98)
    for match in match_list:
        matches.append({"temproster_TargetName": row["name"], "firm_dir_organization": match, "firm_dir_assignee_id": firm_dir[firm_dir["cleaned_organization"] == match]["assignee_id"].values[0]})
# convert the list of match dictionaries to a DataFrame
matches_df = pd.DataFrame(matches)
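An equivalent, slightly shorter variant (a sketch, not part of the original run; the helper name is hypothetical) lets fuzzywuzzy apply the threshold itself via extractOne's score_cutoff argument:
In [ ]:
# hypothetical alternative: score_cutoff makes extractOne return None when no match reaches the threshold
def get_fuzzy_match_cutoff(row, choices, scorer, threshold):
    best_match = process.extractOne(row["name"], choices=choices, scorer=scorer, score_cutoff=threshold)
    return [] if best_match is None else [best_match[0]]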
Exact matching
In [18]:
# simple merge
merged_df = pd.merge(temproster2, firm_dir, left_on="name", right_on="cleaned_organization", how="left")
In [19]:
# rename 'name' back to 'cleaned_TargetName', drop rows where the merge found no match in firm_dir, and keep the relevant columns
merged_df2 = merged_df.rename(columns={"name": "cleaned_TargetName"}).dropna(subset=["cleaned_organization"])[["TargetName", "cleaned_TargetName", "organization", "cleaned_organization", "assignee_id"]]
merged_df2.head(20)
Out[19]:
 | TargetName | cleaned_TargetName | organization | cleaned_organization | assignee_id
---|---|---|---|---|---
1 | AB SCIEX | ab sciex | AB Sciex LLC | ab sciex | 52baa10b-d9bd-4fc7-88ad-3e25d4de3141 |
4 | AECI Ltd | aeci | AECI Limited | aeci | f8bec60a-cb40-4009-837b-f5d557b1e434 |
5 | AEG Elektrofotografie GmbH | aeg elektrofotografie | AEG Elektrofotografie GmbH | aeg elektrofotografie | e2d3a3be-7f93-4e6a-8921-04992b68e048 |
6 | AFx Inc | afx | AFX Inc. | afx | b57b3456-5dc2-4f40-8be0-089cdbd2b027 |
8 | ALZA Corp | alza | ALZA CORPORATION | alza | aad595fd-89b4-4aa0-8992-0d36cadf5ac0 |
11 | AMRESCO Inc | amresco | Amresco Inc. | amresco | 89025abc-6c63-4c79-9fc3-64785dfba78f |
13 | APEIRON Biologics AG | apeiron biologics ag | APEIRON BIOLOGICS AG | apeiron biologics ag | 41af2c4a-5a82-44fa-83fb-2997036b4bca |
16 | ATMI Inc | atmi | ATMI BVBA | atmi | 470c6c47-1290-464e-b289-449b3e9defa4 |
17 | AbVitro Inc | abvitro | AbVitro LLC | abvitro | 1e916abd-b65b-486d-a72d-000b8023748e |
19 | AbbVie Inc | abbvie | AbbVie Inc. | abbvie | e047aabc-b6f3-4b91-899d-79bba32ba295 |
26 | Acceleron Pharma Inc | acceleron pharma | ACCELERON PHARMA INC. | acceleron pharma | 95a8c21b-8256-4b78-9df7-71590971eff5 |
30 | Aceto Corp | aceto | Aceto Corporation | aceto | 4236ec55-1cea-4e33-b6e7-9ee850ba117b |
36 | Acuson Corp | acuson | ACUSON | acuson | e8dcf764-426b-4921-a526-b189a2d335ef |
53 | Aegis Analytical Corp | aegis analytical | AEGIS ANALYTICAL CORPORATION | aegis analytical | 145729fe-c0af-4493-9ff6-13d29777dcdd |
56 | AesRx LLC | aesrx | AesRx LLC | aesrx | ec642047-4c94-490f-871b-02c9d2a150c7 |
62 | Agencourt Bioscience Corp | agencourt bioscience | Agencourt Bioscience Corporation | agencourt bioscience | d4c083f5-faba-4c5a-9778-977ee9b14453 |
66 | Agro Dynamics Corp | agro dynamics | Agro Dynamics | agro dynamics | 3be10411-2968-4cda-903b-d1cc5163b1a5 |
68 | Ahura Scientific Inc | ahura scientific | Ahura Scientific Inc. | ahura scientific | 5cec5486-f446-49a4-9165-d5c29da66727 |
78 | Alanex Corp | alanex | Alanex Corporation | alanex | 07db9c5d-8926-46be-a5ec-70ab3658b6c2 |
80 | Albemarle Corp | albemarle | ALBEMARLE CORPORATION | albemarle | 9f083918-5ae1-4692-80a8-6023c18aca79 |
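To see which target names found no exact counterpart and still need fuzzy or manual matching, one option (a sketch, not part of the original run) is to look at the rows the left merge left unmatched:
In [ ]:
# hypothetical check: target names whose exact merge found no counterpart in firm_dir
unmatched = merged_df[merged_df["cleaned_organization"].isna()][["TargetName", "name"]]
unmatched.head(20)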
In [204]:
merged_df2.to_csv(os.path.join(OUTPUT_DIR, "merged_df.csv"), encoding='utf-8', index=False)
merged_df2.to_excel(os.path.join(OUTPUT_DIR, "merged_df.xlsx"), index=False)