import pandas as pd
import time
from Bio import Entrez
import xml.etree.ElementTree as ET #Optional for parsing XML

# Map PubMed's three-letter month abbreviations to zero-padded numbers.
_MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}

def parse_pub_date(pub_date):
    """Return a sortable 'YYYY-MM-DD' string from a PubMed PubDate dict.

    PubMed's PubDate may omit 'Month'/'Day' and often encodes the month as
    a three-letter abbreviation ("Mar") rather than a number; both forms
    are normalized here so the result is ISO-like and sortable.

    Parameters:
        pub_date: dict-like with optional 'Year', 'Month', 'Day' keys.

    Returns:
        'YYYY-MM-DD' with missing month/day defaulting to '01', or the
        string 'Not Available' when no 'Year' is present.
    """
    if 'Year' not in pub_date:
        return "Not Available"
    year = pub_date['Year']
    month = pub_date.get('Month', '01')  # Default to January if month is missing
    # Normalize "Jan".."Dec" to "01".."12"; zero-pad bare numeric months.
    month = _MONTH_NUMBERS.get(month, month).zfill(2)
    day = pub_date.get('Day', '01').zfill(2)  # Default to first day if missing
    return f"{year}-{month}-{day}"

# Set the email address for Entrez access.  NCBI requires a contact
# address so they can reach you about excessive usage — use a real one.
Entrez.email = '[email protected]'

# Lists of authors and topics to combine into PubMed queries below.
authors = ['Bryan Holland', 'Mehmet Oz', 'Anthony Fauci']
topics = ['RNA', 'cardiovascular', 'virus']
# Restrict results to records created in this window (PubMed query syntax).
date_range = '("2012/03/01"[Date - Create] : "2022/12/31"[Date - Create])'

# Build one PubMed query string per author/topic combination.  When only
# one of the two lists is non-empty, query on that dimension alone; when
# both are empty, fall back to the bare date-range query.
if authors and topics:
    individual_queries = [
        f'({author}[Author] AND {topic}[Title/Abstract] AND {date_range})'
        for author in authors
        for topic in topics
    ]
elif authors:
    individual_queries = [
        f'({author}[Author] AND {date_range})' for author in authors
    ]
elif topics:
    individual_queries = [
        f'({topic}[Title/Abstract] AND {date_range})' for topic in topics
    ]
else:
    individual_queries = [f'({date_range})']

# Accumulate result rows in a plain list and build the DataFrame once at
# the end: pd.concat inside the loop is quadratic in the number of rows.
# Explicit columns also keep drop_duplicates safe when nothing is found.
RESULT_COLUMNS = ['PMID', 'Title', 'Abstract', 'Authors', 'Journal',
                  'Keywords', 'URL', 'Publication Date']
rows = []

# Process each query individually
for query in individual_queries:
    # Search PubMed for up to 50 records matching this query.
    search_handle = Entrez.esearch(db='pubmed', term=query, retmax=50)
    search_result = Entrez.read(search_handle)
    search_handle.close()  # release the network handle promptly
    id_list = search_result['IdList']  # PubMed IDs (PMIDs) for this query

    # Fetch the full article details in XML for each PMID.
    for pmid in id_list:
        fetch_handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
        records = Entrez.read(fetch_handle)
        fetch_handle.close()

        # Process each article found in the fetched XML.  (Named
        # `article_record` to avoid shadowing the esearch result above.)
        for article_record in records['PubmedArticle']:
            article = article_record['MedlineCitation']['Article']

            title = article.get('ArticleTitle', 'Title Not Available')
            # Abstract may be absent; its text arrives as a list of sections.
            abstract = ' '.join(article['Abstract']['AbstractText']) if 'Abstract' in article else ''
            # Author names formatted as 'First Last', comma-separated.
            authors_list = ', '.join(
                a.get('ForeName', '') + ' ' + a.get('LastName', '')
                for a in article.get('AuthorList', [])
            ) or 'Authors Not Available'
            journal = article['Journal'].get('Title', 'Journal Not Available')
            # MeSH descriptor names serve as keywords when present.
            keywords = ', '.join(
                k['DescriptorName']
                for k in article_record['MedlineCitation'].get('MeshHeadingList', [])
            ) or 'Keywords Not Available'
            pub_date = parse_pub_date(article['Journal']['JournalIssue']['PubDate'])
            url = f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}"

            rows.append({
                'PMID': pmid,
                'Title': title,
                'Abstract': abstract,
                'Authors': authors_list,
                'Journal': journal,
                'Keywords': keywords,
                'URL': url,
                'Publication Date': pub_date,
            })

        time.sleep(0.33)  # Pause to respect PubMed server load (NCBI rate limits)

# Build the DataFrame in one shot from the collected rows.
df = pd.DataFrame(rows, columns=RESULT_COLUMNS)

# Remove duplicates based on PMID if needed
df = df.drop_duplicates(subset='PMID')

# Save DataFrame to an Excel file
df.to_excel('PubMed_results.xlsx', index=False)