# MS Proteomics_STRING R human Analysis
#
# PHOTO EMBED
#
# Wed May 06 2026 20:52:41 GMT+0000 (Coordinated Universal Time)
#
# Saved by @1234_5

# ===============================
# STRING analysis for HEK
# Reproducible + comparable pipeline
# ===============================

library(STRINGdb)
library(dplyr)
library(readr)
library(igraph)

# -------------------------------
# 1. Load data
# -------------------------------
# Read the pre-filtered HEK table. The thresholds in the file name
# (BFDR 0.05, SAINT 0.8, Spec 2) are presumably applied upstream;
# nothing in this script re-applies them.
HEK <- read_csv(
  "HEK_filtered_BFDR0.05_Saint0.8_Spec2.csv",
  show_col_types = FALSE
)

# Fail early with a clear message if the expected column is absent;
# otherwise HEK$PreyGene is NULL and the problem only surfaces later,
# during STRING mapping.
if (!"PreyGene" %in% names(HEK)) {
  stop("Column 'PreyGene' not found in the HEK input table.", call. = FALSE)
}

# Extract unique genes, dropping missing/blank entries so they are not
# submitted to the STRING mapper (nzchar() is TRUE for NA, hence both tests).
genes <- unique(as.character(HEK$PreyGene))
genes <- genes[!is.na(genes) & nzchar(genes)]
gene_df <- data.frame(gene = genes, stringsAsFactors = FALSE)

# -------------------------------
# 2. Initialize STRING (HUMAN)
# -------------------------------
# Species 9606 selects the human STRING database. The score threshold of
# 400 is deliberately identical to the JW WT run so both networks stay
# comparable.
string_db <- STRINGdb$new(
  species = 9606,
  version = "11.5",
  score_threshold = 400
)

# -------------------------------
# 3. Map genes
# -------------------------------
# Map the symbols in gene_df$gene to STRING identifiers. Unmapped rows are
# dropped, and takeFirst = TRUE keeps a single STRING id per input symbol.
mapped <- string_db$map(
  gene_df,
  "gene",
  removeUnmappedRows = TRUE,
  takeFirst = TRUE
)

# Different symbols can resolve to the same STRING protein, so collapse to
# unique ids; this keeps `hits` consistent with the count printed below.
hits <- unique(mapped$STRING_id)

cat("Mapped proteins:", length(hits), "\n")

# Stop here with an explicit message rather than letting the network
# retrieval / plotting steps fail obscurely on an empty id vector.
if (length(hits) == 0) {
  stop("No genes could be mapped to STRING identifiers.", call. = FALSE)
}

# -------------------------------
# 4. Get interactions
# -------------------------------
# Edges among the mapped proteins only.
network <- string_db$get_interactions(hits)

# -------------------------------
# 5. IMPORTANT: deduplicate edges
# (this matches your corrected JW pipeline)
# -------------------------------
# Build an order-independent key for every edge (smaller id first) so that
# A-B and B-A collapse to the same key, then keep the first row per key.
network_keyed <- mutate(
  network,
  pair = ifelse(
    from >= to,
    paste(to, from, sep = "_"),
    paste(from, to, sep = "_")
  )
)
network_unique <- distinct(network_keyed, pair, .keep_all = TRUE)

cat("Unique interactions:", nrow(network_unique), "\n")

# -------------------------------
# 6. Save outputs
# -------------------------------
# Output file name -> table to write. The deduplicated edge list is written
# first, then the gene-to-STRING mapping (node table).
outputs <- list(
  "HEK_STRING_network_REPRODUCED.csv" = network_unique,
  "HEK_STRING_nodes_REPRODUCED.csv" = mapped
)

for (out_path in names(outputs)) {
  write.csv(outputs[[out_path]], file = out_path, row.names = FALSE)
}

# -------------------------------
# 7. Plot network
# -------------------------------
# Draw the STRING network for the mapped ids.
string_db$plot_network(hits)

# -------------------------------
# 8. Sanity checks
# -------------------------------
# Number of distinct STRING proteins in the mapping table.
n_proteins <- n_distinct(mapped$STRING_id)
cat("Unique proteins:", n_proteins, "\n")
# content_copyCOPY