library(httr)
library(xml2)
library(dplyr)
library(rentrez)
library(openxlsx)

# Contact email for NCBI Entrez access.
# NOTE(review): rentrez does not read a global `Entrez.email` variable (that
# naming is a Biopython convention), so as written this assignment has no
# effect on the requests below — confirm how the contact email/API key should
# actually be supplied to NCBI for this script.
Entrez.email <- '[email protected]'

# Search terms: every author is crossed with every topic.
authors <- c('Bryan Holland', 'Mehmet Oz', 'Anthony Fauci')
topics <- c('RNA', 'cardiovascular', 'virus')

# Build one PubMed query string per author-topic combination.
# Vectorized sprintf() over expand.grid() replaces the original nested loops
# that grew the vector with c() on every iteration. expand.grid() varies its
# first argument fastest, so (topic, author) reproduces the original order:
# all topics for author 1, then all topics for author 2, and so on.
if (length(authors) > 0 && length(topics) > 0) {
  combos <- expand.grid(topic = topics, author = authors,
                        stringsAsFactors = FALSE)
  individual_queries <- sprintf('(%s[Author] AND %s[Title/Abstract])',
                                combos$author, combos$topic)
} else if (length(authors) > 0) {
  individual_queries <- sprintf('(%s[Author])', authors)
} else if (length(topics) > 0) {
  individual_queries <- sprintf('(%s[Title/Abstract])', topics)
} else {
  # Explicit empty vector (the original left this as NULL), so downstream
  # code always sees a character vector.
  individual_queries <- character(0)
}

# Run each query against PubMed, parse the returned article XML, and collect
# one row of metadata per article. Rows are accumulated in a list and bound
# once at the end (the original rbind()-in-a-loop copies the whole frame on
# every append).
results_list <- list()

for (query in individual_queries) {
  search_results <- entrez_search(db = 'pubmed', term = query, retmax = 50)
  id_list <- search_results$ids

  if (length(id_list) > 0) {
    # Fetch every record for this query in ONE request: entrez_fetch accepts
    # a vector of ids. The original fetched each PMID separately, making up
    # to 50 HTTP calls per query with only a single Sys.sleep per query.
    article_details <- entrez_fetch(db = 'pubmed', id = id_list,
                                    rettype = 'xml', retmode = 'text',
                                    parsed = FALSE)
    article_nodes <- read_xml(article_details)

    for (node in xml_find_all(article_nodes, "//PubmedArticle")) {
      # PMID comes from the record itself now that ids are fetched in bulk.
      pmid <- xml_text(xml_find_first(node, ".//MedlineCitation/PMID"))
      title <- xml_text(xml_find_first(node, ".//ArticleTitle"))
      # Abstracts may be split across several AbstractText sections.
      abstract <- paste(xml_text(xml_find_all(node, ".//AbstractText")),
                        collapse = ' ')
      # "ForeName LastName" per author; missing name parts become "".
      author_nodes <- xml_find_all(node, ".//Author")
      authors_list <- paste(vapply(author_nodes, function(n) {
        forename_node <- xml_find_all(n, ".//ForeName")
        lastname_node <- xml_find_all(n, ".//LastName")
        forename <- if (length(forename_node) > 0) xml_text(forename_node[1]) else ""
        lastname <- if (length(lastname_node) > 0) xml_text(lastname_node[1]) else ""
        paste(forename, lastname)
      }, character(1)), collapse = ', ')
      journal <- xml_text(xml_find_first(node, ".//Journal/Title"))
      # MeSH descriptor names stand in for keywords.
      keyword_nodes <- xml_find_all(node, ".//MeshHeading/DescriptorName")
      keywords <- paste(vapply(keyword_nodes, xml_text, character(1)),
                        collapse = ', ')
      url <- paste0('https://www.ncbi.nlm.nih.gov/pubmed/', pmid)

      results_list[[length(results_list) + 1]] <- data.frame(
        PMID = pmid, Title = title, Abstract = abstract,
        Authors = authors_list, Journal = journal,
        Keywords = keywords, URL = url
      )
    }
  }
  Sys.sleep(0.33)  # Pause to respect PubMed server load
}

# Empty data.frame() fallback preserves the original behavior when no query
# returned any article.
results_df <- if (length(results_list) > 0) bind_rows(results_list) else data.frame()

# The same article can match several author/topic queries; keep the first
# occurrence of each PMID.
results_df <- results_df %>% distinct(PMID, .keep_all = TRUE)

# Export to Excel. openxlsx::write.xlsx takes `rowNames` (camelCase), not
# base R's `row.names` — the original spelling is not a real argument of
# write.xlsx and would not suppress row names.
write.xlsx(results_df, 'PubMed_results.xlsx', rowNames = FALSE)