# Search PubMed for every author/topic combination, collect article metadata
# (title, abstract, authors, journal, MeSH descriptors, URL) and export the
# de-duplicated result to an Excel workbook.

library(httr)
library(xml2)
library(dplyr)
library(rentrez)
library(openxlsx)

# Contact email for Entrez access.
# NOTE(review): this looks like a Biopython-style setting; nothing in this
# script passes it on to rentrez -- confirm whether it is still needed.
Entrez.email <- "[email protected]"

authors <- c("Bryan Holland", "Mehmet Oz", "Anthony Fauci")
topics <- c("RNA", "cardiovascular", "virus")

# Maximum number of PMIDs requested per query.
max_results_per_query <- 50

# Helpers ----

# Build one PubMed query string per author-topic combination. Falls back to
# author-only or topic-only queries when the other vector is empty, and to an
# empty character vector when both are empty.
build_queries <- function(authors, topics) {
  if (length(authors) > 0 && length(topics) > 0) {
    # expand.grid varies its first column fastest, so topics cycle within
    # each author (author-major order).
    combos <- expand.grid(
      topic = topics,
      author = authors,
      stringsAsFactors = FALSE
    )
    sprintf(
      "(%s[Author] AND %s[Title/Abstract])",
      combos$author,
      combos$topic
    )
  } else if (length(authors) > 0) {
    sprintf("(%s[Author])", authors)
  } else if (length(topics) > 0) {
    sprintf("(%s[Title/Abstract])", topics)
  } else {
    character(0)
  }
}

# Text of the first node matching `xpath` below `node`, or "" when absent.
text_or_empty <- function(node, xpath) {
  value <- xml_text(xml_find_first(node, xpath))
  if (is.na(value)) "" else value
}

# Format a single <Author> node as "ForeName LastName". When neither part is
# present the <CollectiveName> (if any) is used instead of a blank entry.
format_author <- function(author_node) {
  full_name <- trimws(paste(
    text_or_empty(author_node, ".//ForeName"),
    text_or_empty(author_node, ".//LastName")
  ))
  if (!nzchar(full_name)) {
    full_name <- trimws(text_or_empty(author_node, ".//CollectiveName"))
  }
  full_name
}

# Convert one <PubmedArticle> node into a one-row data frame.
parse_article <- function(node) {
  pmid <- xml_text(xml_find_first(node, "./MedlineCitation/PMID"))
  author_names <- vapply(
    xml_find_all(node, ".//Author"),
    format_author,
    character(1)
  )
  data.frame(
    PMID = pmid,
    Title = xml_text(xml_find_first(node, ".//ArticleTitle")),
    Abstract = paste(
      xml_text(xml_find_all(node, ".//AbstractText")),
      collapse = " "
    ),
    Authors = paste(author_names[nzchar(author_names)], collapse = ", "),
    Journal = xml_text(xml_find_first(node, ".//Journal/Title")),
    Keywords = paste(
      xml_text(xml_find_all(node, ".//MeshHeading/DescriptorName")),
      collapse = ", "
    ),
    URL = paste0("https://www.ncbi.nlm.nih.gov/pubmed/", pmid),
    stringsAsFactors = FALSE
  )
}

# Run one PubMed search and return a data frame with one row per article,
# or NULL when the search has no hits.
fetch_query_results <- function(query, retmax = max_results_per_query) {
  search_results <- entrez_search(db = "pubmed", term = query, retmax = retmax)
  ids <- search_results$ids
  if (length(ids) == 0) {
    return(NULL)
  }
  # Fetch all records of this query in a single request instead of issuing
  # one request per PMID.
  raw_xml <- entrez_fetch(
    db = "pubmed",
    id = ids,
    rettype = "xml",
    retmode = "text",
    parsed = FALSE
  )
  article_nodes <- xml_find_all(read_xml(raw_xml), "//PubmedArticle")
  bind_rows(lapply(article_nodes, parse_article))
}

# Main ----

individual_queries <- build_queries(authors, topics)

# Collect per-query results in a preallocated list and bind once at the end
# rather than growing a data frame inside the loop.
results_list <- vector("list", length(individual_queries))
for (i in seq_along(individual_queries)) {
  query_results <- fetch_query_results(individual_queries[[i]])
  # Assigning NULL with [[<- would drop the list slot, so only store hits.
  if (!is.null(query_results)) {
    results_list[[i]] <- query_results
  }
  Sys.sleep(0.33) # Pause to respect PubMed server load
}

results_df <- bind_rows(results_list)

if (nrow(results_df) > 0) {
  # The same article can match several queries; keep its first occurrence.
  results_df <- results_df %>%
    distinct(PMID, .keep_all = TRUE)
} else {
  # No hits at all: keep the expected columns so the export still has headers.
  results_df <- data.frame(
    PMID = character(0),
    Title = character(0),
    Abstract = character(0),
    Authors = character(0),
    Journal = character(0),
    Keywords = character(0),
    URL = character(0),
    stringsAsFactors = FALSE
  )
}

# Save the data frame to an Excel file
write.xlsx(results_df, "PubMed_results.xlsx", rowNames = FALSE)