"""Search PubMed for author/topic combinations and export results to Excel.

For every (author, topic) pair, a PubMed query restricted to a creation-date
range is issued via NCBI Entrez.  Each matching article's metadata (title,
abstract, authors, journal, MeSH keywords, publication date, URL) is collected
into a pandas DataFrame and written to ``PubMed_results.xlsx``.
"""

import time

import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET  # optional: handy when inspecting raw efetch XML

# Identify ourselves to NCBI (required by the Entrez usage policy).
Entrez.email = '[email protected]'

# Search criteria.
authors = ['Bryan Holland', 'Mehmet Oz', 'Anthony Fauci']
topics = ['RNA', 'cardiovascular', 'virus']
date_range = '("2012/03/01"[Date - Create] : "2022/12/31"[Date - Create])'


def parse_pub_date(pub_date):
    """Format an Entrez ``PubDate`` dict as ``YYYY-MM-DD``.

    Missing month/day default to ``'01'``; if no ``'Year'`` key is present the
    date is reported as ``"Not Available"``.
    """
    if 'Year' not in pub_date:
        return "Not Available"
    year = pub_date['Year']
    month = pub_date.get('Month', '01')  # default to January if month is missing
    day = pub_date.get('Day', '01')      # default to the first day if day is missing
    return f"{year}-{month}-{day}"


def _build_queries(author_names, topic_terms, created_range):
    """Return the list of PubMed query strings for every author/topic combination.

    Falls back to author-only, topic-only, or date-only queries when one or
    both of the input lists are empty.
    """
    if author_names and topic_terms:
        return [
            f'({author}[Author] AND {topic}[Title/Abstract] AND {created_range})'
            for author in author_names
            for topic in topic_terms
        ]
    if author_names:
        return [f'({author}[Author] AND {created_range})' for author in author_names]
    if topic_terms:
        return [f'({topic}[Title/Abstract] AND {created_range})' for topic in topic_terms]
    return [f'({created_range})']


def _search_pmids(query, retmax=50):
    """Run an esearch for *query* and return the list of matching PMIDs."""
    handle = Entrez.esearch(db='pubmed', term=query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()  # fix: the original never closed the HTTP handle
    return record['IdList']


def _fetch_article_rows(pmid):
    """Fetch the full record for one PMID and yield a metadata dict per article."""
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()  # fix: the original never closed the HTTP handle
    for record in records['PubmedArticle']:
        citation = record['MedlineCitation']
        article = citation['Article']
        title = article.get('ArticleTitle', 'Title Not Available')
        # Join the abstract's text sections into one string, if present.
        abstract = (' '.join(article['Abstract']['AbstractText'])
                    if 'Abstract' in article else '')
        # Join only the non-empty name parts so entries missing a fore- or
        # last name (e.g. collective authors) don't leave stray spaces.
        authors_list = ', '.join(
            ' '.join(part for part in (a.get('ForeName', ''), a.get('LastName', '')) if part)
            for a in article.get('AuthorList', [])
        ) or 'Authors Not Available'
        journal = article['Journal'].get('Title', 'Journal Not Available')
        keywords = ', '.join(
            k['DescriptorName'] for k in citation.get('MeshHeadingList', [])
        ) or 'Keywords Not Available'
        pub_date = parse_pub_date(article['Journal']['JournalIssue']['PubDate'])
        yield {
            'PMID': pmid,
            'Title': title,
            'Abstract': abstract,
            'Authors': authors_list,
            'Journal': journal,
            'Keywords': keywords,
            'URL': f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}",
            'Publication Date': pub_date,
        }


def main():
    """Run every query, collect article metadata, and save it to Excel."""
    rows = []
    for query in _build_queries(authors, topics, date_range):
        for pmid in _search_pmids(query):
            rows.extend(_fetch_article_rows(pmid))
            time.sleep(0.33)  # respect NCBI's ~3 requests/second rate limit
    # Build the DataFrame once from the collected rows.
    # Fix: the original called pd.concat per article, which is quadratic.
    columns = ['PMID', 'Title', 'Abstract', 'Authors', 'Journal',
               'Keywords', 'URL', 'Publication Date']
    df = pd.DataFrame(rows, columns=columns)
    # Drop duplicate articles found by more than one query.
    df = df.drop_duplicates(subset='PMID')
    df.to_excel('PubMed_results.xlsx', index=False)


if __name__ == '__main__':
    # Guard so importing this module does not trigger network I/O.
    main()