Category Similarity Embedding code

PHOTO

Fri Mar 14 2025 09:44:41 GMT+0000 (Coordinated Universal Time)

Saved by @piyushkumar121 #python

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from typing import Dict, List, Tuple, Set
import logging
from ..models.text_embedder import TextEmbedder
from ..database.db_connector import DatabaseConnector

logger = logging.getLogger(__name__)

class SimilarityScorer:
    def __init__(self, config: Dict):
        """Store the configuration and build the scorer's collaborators.

        Args:
            config: Settings mapping. The keys 'similarity_weights',
                'category_relationships', 'audience_relationships' and
                'text_embedding' are read unconditionally; 'pinecone' and
                'database' are optional.
        """
        self.config = config
        self.similarity_weights = config['similarity_weights']

        # Relationship tables are kept as sets so membership tests are cheap.
        self.related_categories = {
            main_category: set(members)
            for main_category, members in config['category_relationships'].items()
        }
        self.related_audiences = {
            main_audience: set(members)
            for main_audience, members in config['audience_relationships'].items()
        }
        self.scaler = MinMaxScaler()

        # Text embedder: the Gemini key is read first, then the Pinecone
        # settings (each one falls back to a default when absent).
        gemini_key = config['text_embedding'].get('gemini_api_key')
        pinecone_settings = {
            'api_key': config.get('pinecone', {}).get('api_key', ''),
            'index_name': config.get('pinecone', {}).get('index_name', 'recommendationsystempro'),
            'namespace': config.get('pinecone', {}).get('namespace', 'influencer-matching'),
        }
        self.text_embedder = TextEmbedder(
            gemini_api_key=gemini_key,
            pinecone_config=pinecone_settings,
        )

        # The database connection is optional; failing to open it only
        # logs a warning and leaves the connector unset.
        self.db_connector = None
        if 'database' not in self.config:
            return
        try:
            self.db_connector = DatabaseConnector(self.config)
        except Exception as e:
            logger.warning(f"Could not initialize database connection: {str(e)}")

    def _get_related_categories(self, category: str) -> Set[str]:
        category = category.lower()
        for main_cat, related in self.related_categories.items():
            if category in related or category == main_cat:
                return related | {main_cat}
        return set()

    def _calculate_category_similarity_embedding(self, brand: pd.Series, influencer: pd.Series) -> float:
        """Score category fit from text embeddings of the category fields.

        Builds one descriptive sentence for the brand ('industry' and
        'category_alignment') and one for the influencer ('category_niche'),
        embeds both with the text embedder and compares them with cosine
        similarity.

        Args:
            brand: Brand record.
            influencer: Influencer record.

        Returns:
            The cosine similarity clamped to [0, 1] and raised to the power
            0.7. If anything in the embedding path raises, the rule-based
            category score is returned instead.
        """
        try:
            # Extract category-related information
            brand_industry = str(brand.get('industry', '')).lower()
            brand_alignment = str(brand.get('category_alignment', '')).lower()
            influencer_niche = str(influencer.get('category_niche', '')).lower()

            # Combine the category data with descriptive context
            brand_category_text = f"Brand industry: {brand_industry}. Brand category alignment: {brand_alignment}"
            influencer_category_text = f"Influencer category/niche: {influencer_niche}"

            # np.asarray lets the embedder return either an ndarray or a
            # plain sequence of floats.
            brand_embedding = np.asarray(self.text_embedder.get_embedding(brand_category_text))
            influencer_embedding = np.asarray(self.text_embedder.get_embedding(influencer_category_text))

            # Calculate cosine similarity between the embedding vectors
            similarity = cosine_similarity(
                brand_embedding.reshape(1, -1),
                influencer_embedding.reshape(1, -1)
            )[0][0]

            # Cosine similarity lies in [-1, 1]. A negative base raised to a
            # fractional power is NaN for NumPy floats, which would poison
            # every downstream score, so clamp to [0, 1] first.
            similarity = min(max(float(similarity), 0.0), 1.0)

            # x ** 0.7 is concave on [0, 1]: it leaves 0 and 1 unchanged and
            # lifts everything in between.
            adjusted_similarity = similarity ** 0.7

            logger.info(f"Embedding-based category similarity score: {adjusted_similarity:.2f} for {brand_industry}/{brand_alignment} -> {influencer_niche}")
            return float(adjusted_similarity)

        except Exception as e:
            logger.warning(f"Error using embeddings for category similarity: {str(e)}, falling back to rule-based method")
            return self._calculate_category_similarity_rule_based(brand, influencer)

    def _calculate_category_similarity_rule_based(self, brand: pd.Series, influencer: pd.Series) -> float:
        brand_categories = set(str(brand.get('industry', '')).lower().split('/'))
        brand_alignment = set(str(brand.get('category_alignment', '')).lower().split('/'))
        influencer_categories = set(str(influencer.get('category_niche', '')).lower().split('/'))
        
        expanded_brand_cats = set()
        for cat in brand_categories | brand_alignment:
            expanded_brand_cats.update(self._get_related_categories(cat))
        
        expanded_influencer_cats = set()
        for cat in influencer_categories:
            expanded_influencer_cats.update(self._get_related_categories(cat))
        
        direct_matches = len(brand_categories.intersection(influencer_categories))
        alignment_matches = len(brand_alignment.intersection(influencer_categories))
        related_matches = len(expanded_brand_cats.intersection(expanded_influencer_cats))
        
        score = (
            direct_matches * 0.6 +
            alignment_matches * 0.3 +
            related_matches * 0.1
        ) / max(len(brand_categories), 1)
        
        if direct_matches == 0 and alignment_matches == 0:
            score *= 0.2
        
        return score

    def _calculate_category_similarity(self, brand: pd.Series, influencer: pd.Series) -> float:
        """Return the category similarity between a brand and an influencer.

        Delegates to the embedding-based scorer, which itself falls back to
        the rule-based scorer when the embedding path raises.
        """
        # Try the embedding-based approach first, fallback to rule-based if it fails
        return self._calculate_category_similarity_embedding(brand, influencer)

    def _calculate_audience_similarity(self, brand: pd.Series, influencer: pd.Series) -> float:
        brand_audience = str(brand.get('target_audience', '')).lower()
        influencer_audience = str(influencer.get('audience_demographics', '')).lower()
        
        demographic_match = float(brand_audience in influencer_audience or 
                                influencer_audience in brand_audience)
        
        related_match = 0.0
        for main_audience, related in self.related_audiences.items():
            if (brand_audience in {a.lower() for a in related | {main_audience}} and
                influencer_audience in {a.lower() for a in related | {main_audience}}):
                related_match = 0.7
                break
        
        brand_geo = str(brand.get('geographic_target', '')).lower()
        influencer_loc = str(influencer.get('location', '')).lower()
        geo_match = float(
            brand_geo in influencer_loc or
            influencer_loc in brand_geo or
            brand_geo == 'global' or
            (brand_geo == 'north america' and influencer_loc in ['usa', 'canada'])
        )
        
        brand_lang = set(str(brand.get('language_preferences', '')).lower().split('/'))
        influencer_lang = set(str(influencer.get('languages', '')).lower().split('/'))
        lang_match = len(brand_lang.intersection(influencer_lang)) / max(len(brand_lang), 1)
        
        audience_score = max(demographic_match, related_match) * 0.5 + geo_match * 0.3 + lang_match * 0.2
        
        return audience_score

    def _safe_float(self, value, default=0.0) -> float:
        try:
            result = float(value)
            return result if result != 0 else default
        except (ValueError, TypeError):
            return default

    def _safe_division(self, numerator, denominator, default=0.0) -> float:
        num = self._safe_float(numerator)
        den = self._safe_float(denominator)
        if den == 0:
            return default
        return num / den

    def _calculate_numerical_similarity(self, brand: pd.Series, influencer: pd.Series) -> float:
        scores = []
        
        min_followers = self._safe_float(brand.get('min_follower_range'), 1.0)
        actual_followers = self._safe_float(influencer.get('follower_count'), 0.0)
        if actual_followers < min_followers:
            return 0.0
        
        follower_ratio = self._safe_division(actual_followers, min_followers, 0.0)
        scores.append(min(follower_ratio, 2.0))
        
        min_engagement = self._safe_float(brand.get('min_engagement_rate'), 0.01)
        actual_engagement = self._safe_float(influencer.get('engagement_rate'), 0.0)
        if actual_engagement < min_engagement:
            return 0.0
        
        engagement_ratio = self._safe_division(actual_engagement, min_engagement, 0.0)
        scores.append(min(engagement_ratio, 2.0))
        
        posts_per_campaign = self.config['matching']['posts_per_campaign']
        campaign_budget = self._safe_float(brand.get('campaign_budget'), 0.0)
        cost_per_post = self._safe_float(influencer.get('cost_per_post'), float('inf'))
        if cost_per_post * posts_per_campaign > campaign_budget:
            return 0.0
        
        if campaign_budget > 0 and cost_per_post < float('inf'):
            budget_ratio = campaign_budget / (cost_per_post * posts_per_campaign)
            scores.append(min(budget_ratio, 2.0))
        
        if not scores:
            return 0.0
        
        average_score = np.mean(scores)
        return min(average_score, 1.0)

    def _calculate_compliance_similarity(self, brand: pd.Series, influencer: pd.Series) -> float:
        requires_controversy_free = brand.get('requires_controversy_free', False)
        controversy_flag = influencer.get('controversy_flag', True)
        compliance_status = str(influencer.get('compliance_status', '')).lower()
        
        if requires_controversy_free and controversy_flag:
            return 0.0
        
        controversy_match = not (requires_controversy_free and controversy_flag)
        compliance_match = compliance_status == 'verified'
        
        return (float(controversy_match) + float(compliance_match)) / 2

    def calculate_similarity_matrix(self, brands_features: pd.DataFrame, 
                                 influencers_features: pd.DataFrame) -> np.ndarray:
        similarity_matrix = np.zeros((len(brands_features), len(influencers_features)))
        text_similarity_matrix = np.zeros((len(brands_features), len(influencers_features)))
        
        for i, brand in brands_features.iterrows():
            brand_text = self.text_embedder.get_brand_text_features(brand)
            for j, influencer in influencers_features.iterrows():
                influencer_text = self.text_embedder.get_influencer_text_features(influencer)
                text_similarity = self.text_embedder.calculate_text_similarity(brand_text, influencer_text)
                text_similarity_matrix[brands_features.index.get_loc(i),
                                    influencers_features.index.get_loc(j)] = text_similarity

        for i, brand in brands_features.iterrows():
            for j, influencer in influencers_features.iterrows():
                category_score = self._calculate_category_similarity(brand, influencer)
                audience_score = self._calculate_audience_similarity(brand, influencer)
                numerical_score = self._calculate_numerical_similarity(brand, influencer)
                compliance_score = self._calculate_compliance_similarity(brand, influencer)
                
                traditional_score = (
                    category_score * self.similarity_weights['category'] +
                    audience_score * self.similarity_weights['audience'] +
                    numerical_score * self.similarity_weights['numerical'] +
                    compliance_score * self.similarity_weights['compliance']
                )
                
                if numerical_score == 0.0:
                    traditional_score = 0.0
                elif category_score < 0.3:
                    traditional_score *= 0.5
                
                text_score = text_similarity_matrix[brands_features.index.get_loc(i),
                                                 influencers_features.index.get_loc(j)]
                
                final_score = 0.5 * traditional_score + 0.5 * text_score
                
                similarity_matrix[brands_features.index.get_loc(i),
                                influencers_features.index.get_loc(j)] = final_score
        
        max_score = similarity_matrix.max()
        if max_score > 0:
            similarity_matrix = similarity_matrix / max_score
            similarity_matrix = np.where(similarity_matrix > 0.95, 0.95, similarity_matrix)
        
        return similarity_matrix

    def get_top_matches(self, similarity_matrix: np.ndarray,
                       brands_df: pd.DataFrame,
                       influencers_df: pd.DataFrame) -> List[Tuple[str, str, float]]:
        matches = []
        top_n = self.config['matching']['top_n']
        min_similarity = self.config['matching']['similarity_threshold']
        
        for i, brand in brands_df.iterrows():
            brand_matches = []
            for j, influencer in influencers_df.iterrows():
                category_score = self._calculate_category_similarity(brand, influencer)
                audience_score = self._calculate_audience_similarity(brand, influencer)
                numerical_score = self._calculate_numerical_similarity(brand, influencer)
                compliance_score = self._calculate_compliance_similarity(brand, influencer)
                
                traditional_score = (
                    category_score * self.similarity_weights['category'] +
                    audience_score * self.similarity_weights['audience'] +
                    numerical_score * self.similarity_weights['numerical'] +
                    compliance_score * self.similarity_weights['compliance']
                )
                
                brand_text = self.text_embedder.get_brand_text_features(brand)
                influencer_text = self.text_embedder.get_influencer_text_features(influencer)
                text_score = self.text_embedder.calculate_text_similarity(brand_text, influencer_text)
                
                final_score = 0.5 * traditional_score + 0.5 * text_score
                
                if numerical_score == 0.0:
                    final_score = 0.0
                elif category_score < self.config['matching']['min_category_score']:
                    final_score *= self.config['matching']['category_penalty']
                
                if final_score >= min_similarity:
                    brand_matches.append((
                        brand.name,
                        influencer.name,
                        round(final_score, 3)
                    ))
            
            brand_matches.sort(key=lambda x: x[2], reverse=True)
            matches.extend(brand_matches[:top_n])
        
        return matches
    
    def save_matches_to_database(self, matches: List[Tuple[str, str, float]]) -> bool:
        """Persist match tuples through the database connector.

        Ensures the ``matches`` table exists, then hands the rows to the
        connector's ``insert_matches``.

        Args:
            matches: ``(brand_id, influencer_id, score)`` tuples.

        Returns:
            True when the rows were handed to the connector without error;
            False when no connector is available or any step raised.
        """
        if not self.db_connector:
            logger.error("Database connector not available. Cannot save matches.")
            return False

        try:
            # Unpacking happens inside the try block so malformed tuples are
            # reported through the same error path as database failures.
            rows = [
                {
                    'brand_id': brand_id,
                    'influencer_id': influencer_id,
                    'similarity_score': score,
                }
                for brand_id, influencer_id, score in matches
            ]

            self.db_connector.execute_query("""
            CREATE TABLE IF NOT EXISTS matches (
                id INT AUTO_INCREMENT PRIMARY KEY,
                brand_id VARCHAR(50),
                influencer_id VARCHAR(50),
                similarity_score FLOAT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """)

            self.db_connector.insert_matches(rows)
        except Exception as e:
            logger.error(f"Error saving matches to database: {str(e)}")
            return False
        else:
            logger.info(f"Saved {len(matches)} matches to database")
            return True

Save snippets that work from anywhere online with our extensions

Available in the Chrome Web Store

Get Firefox Add-on

Get VS Code extension

Comments

More like this

Importing images from a directory (Python) to list or dictionary

from PIL import Image
import glob

# Open every GIF in the directory (pattern assumes .gif files) and keep the
# resulting image objects in a list.
image_list = [Image.open(filename) for filename in glob.glob('yourpath/*.gif')]

python - Find out the percentage of missing values in each column in the given dataset - Stack Overflow

# Percentage of missing cells in each column of df.
row_count = len(df)
percent_missing = df.isnull().sum() * 100 / row_count

# One row per column: the column's name next to its missing percentage.
missing_value_df = pd.DataFrame({
    'column_name': df.columns,
    'percent_missing': percent_missing,
})

#python #python #loops #whileloop

Print the name of 7 days in a week - by using while loop

days = 0
week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Walk the list by index until every day has been printed.
while days < len(week):
    print("Today is " + week[days])
    days += 1

#javascript #python #search #historicalcode #google #algorithms

Google’s PageRank Algorithm from 1996 - the origin of internet search

import numpy as np

def pagerank(M, num_iterations=100, d=0.85):
    """Estimate PageRank scores by repeated multiplication.

    Args:
        M: Square link matrix; ``M.shape[1]`` is taken as the node count.
        num_iterations: Number of update steps to run.
        d: Damping factor.

    Returns:
        An ``(N, 1)`` array holding the rank vector after the final step.
    """
    node_count = M.shape[1]

    # Random starting vector, scaled by its 1-norm.
    rank = np.random.rand(node_count, 1)
    rank = rank / np.linalg.norm(rank, 1)

    teleport = (1 - d) / node_count
    remaining = num_iterations
    while remaining > 0:
        remaining -= 1
        rank = d * (M @ rank) + teleport
    return rank

#python #python #strings #vowels #function

Get vowels in strings

This method gets vowels (‘a’, ‘e’, ‘i’, ‘o’, ‘u’) found in a string.
   
#make a function:
def get_vowels(string):
    """Return the lowercase vowels found in ``string``, in order of appearance."""
    found = []
    for char in string:
        if char in 'aeiou':
            found.append(char)
    return found


# Call the function with a word; it returns the vowels found in that word.
get_vowels('foobar') # ['o', 'o', 'a']


get_vowels('gym') # []

Getting the index of an item in a list containing it in Python

>>> ["foo", "bar", "baz"].index("bar")
1

Could not build wheels for tokenizers which use PEP 517 and cannot be installed directly

https://github.com/pydata/bottleneck/issues/281

How To Bypass Cloudflare Bot Protection In Selenium - CodingTutz

# NOTE(review): `webdriver` is not imported in this snippet; presumably
# `from selenium import webdriver` is expected above it. Confirm.
options = webdriver.ChromeOptions()
# Ask Chrome to leave out its "enable-automation" switch.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Turn the automation extension off.
options.add_experimental_option('useAutomationExtension', False)
# Disable the Blink "AutomationControlled" feature.
options.add_argument("--disable-blink-features=AutomationControlled")
# Start Chrome with the options assembled above.
driver = webdriver.Chrome(options=options)

Python Loop through Excel sheets, place into one df - Stack Overflow

import pandas as pd

# sheet_name=None makes pandas read every sheet and return them as a
# {sheet name: DataFrame} mapping. (The keyword used to be spelled
# `sheetname`; current pandas only accepts `sheet_name`.)
sheets_dict = pd.read_excel('Book1.xlsx', sheet_name=None)

frames = []
for name, sheet in sheets_dict.items():
    # Remember which sheet each row came from.
    sheet['sheet'] = name
    # Keep only the last line of multi-line column headers.
    sheet = sheet.rename(columns=lambda x: x.split('\n')[-1])
    frames.append(sheet)

# DataFrame.append was removed in pandas 2.0. Concatenating once at the end
# is the supported replacement; ignore_index=True renumbers the rows from 0.
full_table = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(full_table)

#python #dates #functions #python3.8

How to parse a String into Datetime in Python

from datetime import datetime

# Layout of the input text: abbreviated month, day, four-digit year, then a
# 12-hour clock time followed by AM/PM.
DATE_FORMAT = '%b %d %Y %I:%M%p'
raw_timestamp = 'Jun 1 2005  1:33PM'

datetime_object = datetime.strptime(raw_timestamp, DATE_FORMAT)

python - Way to change Google Chrome user agent in Selenium? - Stack Overflow

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from fake_useragent import UserAgent

options = Options()
ua = UserAgent()
userAgent = ua.random
print(userAgent)
options.add_argument(f'user-agent={userAgent}')

# Selenium 4 takes the driver path through a Service object and the browser
# options through `options=`; the older `executable_path=` and
# `chrome_options=` keywords are no longer accepted.
service = Service(executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get("https://www.google.co.in")
finally:
    # Always shut the browser down, even when navigation fails.
    driver.quit()

python - How to see the progress bar of read_csv - Stack Overflow

def read_csv_pgbar(csv_path, chunksize, usecols, dtype=object):
    """Read a CSV file in chunks while showing a progress bar.

    Args:
        csv_path: Path of the CSV file.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.
        usecols: Columns to load, passed to ``pd.read_csv``.
        dtype: Column dtype(s), passed to ``pd.read_csv``.

    Returns:
        A single DataFrame built by concatenating all chunks.
    """
    # Count the lines up front so the bar has a total. The context manager
    # closes the handle afterwards instead of leaving it open.
    # NOTE(review): this counts physical lines, so quoted fields that
    # contain newlines will make the total too large.
    with open(csv_path, 'r') as handle:
        rows = max(sum(1 for _ in handle) - 1, 0)  # minus the header

    chunk_list = []

    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize, usecols=usecols, dtype=dtype):
            chunk_list.append(chunk)
            bar.update(len(chunk))

    df = pd.concat(chunk_list, axis=0)
    print('Finish reading csv file')

    return df

#python #python #lists #dictionary

Convert two lists into a dictionary

# make a function: def is the keyword for the function:
def to_dictionary(keys, values):
    """Pair each key with the value at the same position and return the dict."""
    # return is the keyword that tells the program the function has to return a value
    return dict(zip(keys, values))


# keys and values are the lists:
keys = ["a", "b", "c"]
values = [2, 3, 4]

print(to_dictionary(keys, values))  # {'a': 2, 'b': 3, 'c': 4}

Check String contains Substring Method

if "blah" not in somestring: 
    continue

#python #interesting #arrays #sorting #interviewquestions

Sorting an array without changing position of negative numbers

# Python3 implementation of the approach 

# Function to sort the array such that 
# negative values do not get affected 
def sortArray(a, n):
    """Sort the non-negative values of ``a[:n]`` in place and print the result.

    Negative values keep their positions; the non-negative values are
    redistributed in ascending order over the remaining positions. The first
    ``n`` elements are then printed, each followed by a space.
    """
    # Positions (in order) that hold a non-negative value.
    slots = [i for i in range(n) if a[i] >= 0]

    # The values that belong in those positions, smallest first.
    ordered = sorted(a[i] for i in slots)

    # Write the sorted values back into the non-negative positions.
    for slot, value in zip(slots, ordered):
        a[slot] = value

    # Print the sorted array
    for i in range(n):
        print(a[i], end=" ")


# Driver code 

arr = [2, -6, -3, 8, 4, 1] 

n = len(arr) 

sortArray(arr, n)

Browse more snippets >>