Spaces:

jpwahle
/

field-time-diversity

Sleeping

App Files Files Community

jpwahle committed on Sep 20, 2023

Commit

7574c0c

•

1 Parent(s): 8b8e2ce

Finalize PDF function and update on hf-hub

Browse files

Files changed (5) hide show

Dockerfile +5 -0
main.py +14 -5
pdf.py +14 -0
plots.py +61 -21
s2.py +237 -92

Dockerfile CHANGED Viewed

@@ -14,6 +14,8 @@ RUN mkdir -p /var/run/supervisor && chmod 777 /var/run/supervisor
 # Install supervisord and python (for gradio)
 RUN apt-get update && apt-get install -y supervisor python3 python3-pip && rm -rf /var/lib/apt/lists/*
 RUN pip3 install gradio
 # Copy your gradio app to the image
 COPY . /app/
@@ -22,6 +24,9 @@ COPY ./data /app/data
 # Install gradio
 RUN pip3 install -r /app/requirements.txt
 # Supervisord configuration
 RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \
     echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \

 # Install supervisord and python (for gradio)
 RUN apt-get update && apt-get install -y supervisor python3 python3-pip && rm -rf /var/lib/apt/lists/*
 RUN pip3 install gradio
+RUN pip3 install git+https://github.com/kermitt2/grobid_client_python
+RUN pip3 install git+https://github.com/titipata/scipdf_parser
 # Copy your gradio app to the image
 COPY . /app/
 # Install gradio
 RUN pip3 install -r /app/requirements.txt
+# Download spacy en_core_web_sm
+RUN python3 -m spacy download en_core_web_sm
 # Supervisord configuration
 RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \
     echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \

main.py CHANGED Viewed

@@ -14,6 +14,7 @@ from s2 import (
     compute_stats_for_acl_author,
     compute_stats_for_acl_paper,
     compute_stats_for_acl_venue,
     compute_stats_for_s2_author,
     compute_stats_for_s2_paper,
 )
@@ -35,25 +36,32 @@ def create_compute_stats(submit_type=None):
             id_type, author_name = check_s2_id_type(s2_id)
             if id_type == "paper":
                 results = compute_stats_for_s2_paper(s2_id)
                 return plot_and_return_stats(*results)
             if id_type == "author":
                 results = compute_stats_for_s2_author(s2_id, author_name)
                 return plot_and_return_stats(*results)
         if submit_type == "acl_link" and acl_link:
             # Crawl all papers for the author or venue or just the paper if it is a paper link
             url_type = determine_page_type(acl_link)
             if url_type == "paper":
                 results = compute_stats_for_acl_paper(acl_link)
                 return plot_and_return_stats(*results)
             if url_type == "author":
                 results = compute_stats_for_acl_author(acl_link)
                 return plot_and_return_stats(*results)
             if url_type == "venue":
                 results = compute_stats_for_acl_venue(acl_link)
                 return plot_and_return_stats(*results)
-        # if submit_type == "pdf_file" and pdf_file:
-        #     # Compute the citation field diversity index and citation age diversity index
-        #     pass
         return None, None, None, None, None, None, None, None
     return compute_stats
@@ -67,6 +75,7 @@ def plot_and_return_stats(
     cfdi,
     cadi,
     maoc,
 ):
     """
     Plots the data and returns statistics.
@@ -85,10 +94,10 @@ def plot_and_return_stats(
         the most common oldest papers, the cfdi, cadi, and the plots for cfdi and maoc.
     """
     # Generate cfdi plot
-    plot_cfdi = generate_cfdi_plot(cfdi)
     # Generate cadi plot
-    plot_maoc = generate_maoc_plot(maoc)
     # Get top 3 most cited fields
     top_fields_text = "\n".join(

     compute_stats_for_acl_author,
     compute_stats_for_acl_paper,
     compute_stats_for_acl_venue,
+    compute_stats_for_pdf,
     compute_stats_for_s2_author,
     compute_stats_for_s2_paper,
 )
             id_type, author_name = check_s2_id_type(s2_id)
             if id_type == "paper":
                 results = compute_stats_for_s2_paper(s2_id)
+                results = results + ("paper",)
                 return plot_and_return_stats(*results)
             if id_type == "author":
                 results = compute_stats_for_s2_author(s2_id, author_name)
+                results = results + ("author",)
                 return plot_and_return_stats(*results)
         if submit_type == "acl_link" and acl_link:
             # Crawl all papers for the author or venue or just the paper if it is a paper link
             url_type = determine_page_type(acl_link)
             if url_type == "paper":
                 results = compute_stats_for_acl_paper(acl_link)
+                results = results + ("paper",)
                 return plot_and_return_stats(*results)
             if url_type == "author":
                 results = compute_stats_for_acl_author(acl_link)
+                results = results + ("author",)
                 return plot_and_return_stats(*results)
             if url_type == "venue":
                 results = compute_stats_for_acl_venue(acl_link)
+                results = results + ("proceedings",)
                 return plot_and_return_stats(*results)
+        if submit_type == "pdf_file" and pdf_file:
+            # Compute the citation field diversity index and citation age diversity index
+            results = asyncio.run(compute_stats_for_pdf(pdf_file))
+            results = results + ("paper",)
+            return plot_and_return_stats(*results)
         return None, None, None, None, None, None, None, None
     return compute_stats
     cfdi,
     cadi,
     maoc,
+    compute_type,
 ):
     """
     Plots the data and returns statistics.
         the most common oldest papers, the cfdi, cadi, and the plots for cfdi and maoc.
     """
     # Generate cfdi plot
+    plot_cfdi = generate_cfdi_plot(cfdi, compute_type)
     # Generate cadi plot
+    plot_maoc = generate_maoc_plot(maoc, compute_type)
     # Get top 3 most cited fields
     top_fields_text = "\n".join(

pdf.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import scipdf
+def parse_pdf_to_artcile_dict(pdf_path):
+    """
+    Parses a PDF into an article dictionary by delegating to scipdf.
+    Args:
+        pdf_path: The PDF location, handed to scipdf.parse_pdf_to_dict unchanged.
+    Returns:
+        The value returned by scipdf.parse_pdf_to_dict for that path.
+    """
+    # NOTE: "artcile" is a misspelling of "article"; the name is kept as-is
+    # because s2.py imports this function under exactly this name.
+    article_dict = scipdf.parse_pdf_to_dict(pdf_path)
+    return article_dict
+if __name__ == "__main__":
+    # Manual smoke test: parse one PDF and print a few fields of the result.
+    import sys
+    # Use the path given on the command line when there is one; otherwise fall
+    # back to the original developer-local sample, which only exists on the
+    # author's machine.
+    demo_pdf_path = (
+        sys.argv[1]
+        if len(sys.argv) > 1
+        else "/Users/jp/Documents/papers/demo-test/EMNLP23_Influence_NLP_Citation_Analysis.pdf"
+    )
+    article_dict = parse_pdf_to_artcile_dict(demo_pdf_path)  # return dictionary
+    print(article_dict.keys())
+    print(article_dict["title"])
+    # A PDF without parsed references would otherwise raise IndexError here
+    if article_dict["references"]:
+        print(article_dict["references"][0].keys())

plots.py CHANGED Viewed

@@ -33,7 +33,7 @@ with open(
         mean_citation_ages.append(temp)
-def generate_cfdi_plot(input_cfdi):
     """
     Function to generate a plot for CFDI
     """
@@ -56,20 +56,40 @@ def generate_cfdi_plot(input_cfdi):
         interpolated_y_cfdi,
         c="r",
         marker="*",
-        linewidths=1,
         zorder=2,
     )
     ax.vlines(
-        input_cfdi, 0, interpolated_y_cfdi, color="tomato", ls="--", lw=1.5
     )
     epsilon = 0.005
-    # ax.text(
-    #     input_cfdi + epsilon,
-    #     interpolated_y_cfdi + epsilon,
-    #     "Your paper",
-    #     {"color": "#DC143C", "fontsize": 13},
-    #     ha="left",  # Horizontal alignment
-    # )
     ax.set_xlabel("Citation Field Diversity Index (CFDI)", fontsize=15)
     ax.set_ylabel("Density", fontsize=15)
@@ -78,9 +98,9 @@ def generate_cfdi_plot(input_cfdi):
     return fig
-def generate_maoc_plot(input_maoc):
     """
-    Function to generate a plot for CFDI
     """
     # Using kdeplot to fill the distribution curve
     sns.set(font_scale=1.3, style="whitegrid")
@@ -100,20 +120,40 @@ def generate_maoc_plot(input_maoc):
         interpolated_y_cfdi,
         c="r",
         marker="*",
-        linewidths=1,
         zorder=2,
     )
     ax.vlines(
-        input_maoc, 0, interpolated_y_cfdi, color="tomato", ls="--", lw=1.5
     )
     epsilon = 0.005
-    # ax.text(
-    #     input_maoc + epsilon,
-    #     interpolated_y_cfdi + epsilon,
-    #     "Your paper",
-    #     {"color": "#DC143C", "fontsize": 13},
-    #     ha="left",  # Horizontal alignment
-    # )
     ax.set_xlabel("Mean Age of Citation (mAoC)", fontsize=15)
     ax.set_ylabel("Density", fontsize=15)

         mean_citation_ages.append(temp)
+def generate_cfdi_plot(input_cfdi, compute_type="paper"):
     """
     Function to generate a plot for CFDI
     """
         interpolated_y_cfdi,
         c="r",
         marker="*",
+        linewidths=2,
         zorder=2,
+        s=32,
     )
     ax.vlines(
+        input_cfdi,
+        0,
+        interpolated_y_cfdi,
+        color="tomato",
+        ls="--",
+        lw=1.5,
     )
     epsilon = 0.005
+    # Compute the average and plot it as a light grey vertical line
+    mean_val = np.mean(data)
+    # Interpolate the y value for the mean
+    interpolated_y_mean = np.interp(mean_val, x_vals, y_vals)
+    ax.vlines(mean_val, 0, interpolated_y_mean, color="grey", ls="--", lw=1.5)
+    ax.text(
+        mean_val + epsilon,
+        interpolated_y_mean + epsilon,
+        "Avg.",
+        {"color": "grey", "fontsize": 13},
+        ha="left",  # Horizontal alignment
+    )
+    ax.text(
+        input_cfdi + epsilon,
+        interpolated_y_cfdi + epsilon,
+        f"This {compute_type}",
+        {"color": "#DC143C", "fontsize": 13},
+        ha="left",  # Horizontal alignment
+    )
     ax.set_xlabel("Citation Field Diversity Index (CFDI)", fontsize=15)
     ax.set_ylabel("Density", fontsize=15)
     return fig
+def generate_maoc_plot(input_maoc, compute_type="paper"):
     """
+    Function to generate a plot for MAOC
     """
     # Using kdeplot to fill the distribution curve
     sns.set(font_scale=1.3, style="whitegrid")
         interpolated_y_cfdi,
         c="r",
         marker="*",
+        linewidths=2,
         zorder=2,
+        s=32,
     )
     ax.vlines(
+        input_maoc,
+        0,
+        interpolated_y_cfdi,
+        color="tomato",
+        ls="--",
+        lw=1.5,
     )
     epsilon = 0.005
+    # Compute the average and plot it as a light grey vertical line
+    mean_val = np.mean(data)
+    # Interpolate the y value for the mean
+    interpolated_y_mean = np.interp(mean_val, x_vals, y_vals)
+    ax.vlines(mean_val, 0, interpolated_y_mean, color="grey", ls="--", lw=1.5)
+    ax.text(
+        mean_val + epsilon,
+        interpolated_y_mean + epsilon,
+        "Avg.",
+        {"color": "grey", "fontsize": 13},
+        ha="left",  # Horizontal alignment
+    )
+    ax.text(
+        input_maoc + epsilon,
+        interpolated_y_cfdi + epsilon,
+        f"This {compute_type}",
+        {"color": "#DC143C", "fontsize": 13},
+        ha="left",  # Horizontal alignment
+    )
     ax.set_xlabel("Mean Age of Citation (mAoC)", fontsize=15)
     ax.set_ylabel("Density", fontsize=15)

s2.py CHANGED Viewed

@@ -1,11 +1,15 @@
 # Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
 # All rights reserved.
 import asyncio
 import os
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import requests
 from aclanthology import (
@@ -15,9 +19,16 @@ from aclanthology import (
     extract_venue_info,
 )
 from metrics import calculate_gini, calculate_gini_simpson
 def get_or_create_eventloop():
     try:
         return asyncio.get_event_loop()
     except RuntimeError as ex:
@@ -56,12 +67,10 @@ def check_s2_id_type(semantic_scholar_id):
         the name of the author (if the ID is valid for an author), or "invalid"
         if the ID is not valid for either a paper or an author.
     """
-    # Define the base URL for Semantic Scholar API
-    base_url = "https://api.semanticscholar.org/v1/"
     # First, check if it's a paper ID
     paper_response = requests.get(
-        f"{base_url}paper/{semantic_scholar_id}", timeout=5
     )
     # If the response status code is 200, it means the ID is valid for a paper
@@ -70,7 +79,8 @@ def check_s2_id_type(semantic_scholar_id):
     # Next, check if it's an author ID
     author_response = requests.get(
-        f"{base_url}author/{semantic_scholar_id}", timeout=5
     )
     # If the response status code is 200, it means the ID is valid for an author
@@ -101,6 +111,115 @@ def get_papers_from_author(ssid_author_id):
     return []
 def compute_stats_for_s2_paper(ssid_paper_id):
     """
     Computes statistics for a given paper ID using the Semantic Scholar API.
@@ -143,87 +262,14 @@ def compute_stats_for_s2_paper(ssid_paper_id):
             title + "\n" + ", ".join([author["name"] for author in authors])
         )
-        # Go over the references of the paper
-        reference_year_list = []
-        reference_title_list = []
-        reference_fos_list = []
-        with ThreadPoolExecutor() as executor:
-            request_url_refs = [
-                f"https://api.semanticscholar.org/graph/v1/paper/{ref_paper_key}?fields=title,year,s2FieldsOfStudy"
-                for ref_paper_key in filtered_s2_ref_paper_keys
-            ]
-            futures = [
-                executor.submit(send_s2_request, request_url_ref)
-                for request_url_ref in request_url_refs
-            ]
-            for future in as_completed(futures):
-                r_ref = future.result()
-                if r_ref.status_code == 200:
-                    result_ref = r_ref.json()
-                    (title_ref, year_ref, fields_ref) = (
-                        result_ref["title"],
-                        result_ref["year"],
-                        result_ref["s2FieldsOfStudy"],
-                    )
-                    reference_year_list.append(year_ref)
-                    reference_title_list.append(title_ref)
-                    reference_fos_list.extend(
-                        field["category"]
-                        for field in fields_ref
-                        if field["source"] == "s2-fos-model"
-                    )
-                else:
-                    print(
-                        f"Error retrieving reference {r_ref.status_code} for"
-                        f" paper {ssid_paper_id}"
-                    )
-        # Remove all None from reference_year_list and reference_title_list
-        reference_year_list = [
-            year_ref
-            for year_ref in reference_year_list
-            if year_ref is not None
-        ]
-        reference_title_list = [
-            title_ref
-            for title_ref in reference_title_list
-            if title_ref is not None
-        ]
-        # Count references
-        num_references = len(reference_year_list)
-        # Flatten list and count occurrences
-        fields_of_study_counts = dict(
-            Counter(
-                [
-                    field
-                    for field in reference_fos_list
-                    if "Computer Science" not in field
-                ]
-            )
-        )
-        # Citation age list
-        aoc_list = [
-            year - year_ref
-            for year_ref in reference_year_list
-            if year_ref and year
-        ]
-        if not aoc_list:
-            return None, None, None, None, None, None, None, None
-        # Compute citation age
-        output_maoc = sum(aoc_list) / len(aoc_list)
-        cadi = calculate_gini(aoc_list)
-        # Create a dictionary of year to title
-        year_to_title_dict = dict(
-            zip(reference_year_list, reference_title_list)
-        )
-        # Compute CFDI
-        cfdi = calculate_gini_simpson(fields_of_study_counts)
         # Return the results
         return (
@@ -273,9 +319,6 @@ def compute_stats_for_acl_paper(url):
     return None
-import asyncio
 def compute_stats_for_acl_author(url):
     """
     Computes statistics for an author's papers in the ACL anthology.
@@ -303,6 +346,15 @@ def compute_stats_for_acl_author(url):
 def compute_stats_for_acl_venue(url):
     if paper_info := extract_venue_info(url):
         loop = get_or_create_eventloop()
         tasks = [
@@ -317,7 +369,26 @@ def compute_stats_for_acl_venue(url):
     return None
-def compute_stats_for_multiple_s2_papers(papers, title):
     num_references = 0
     top_fields = {}
     oldest_paper_dict = {}
@@ -337,8 +408,8 @@ def compute_stats_for_multiple_s2_papers(papers, title):
         num_references += results[1]
         for field, count in results[2].items():
             top_fields[field] = top_fields.get(field, 0) + count
-        for year, title in results[3].items():
-            oldest_paper_dict[year] = title
         cfdi += results[4]
         cadi += results[5]
         output_maoc += results[6]
@@ -352,3 +423,77 @@ def compute_stats_for_multiple_s2_papers(papers, title):
         cadi / len(papers),
         output_maoc / len(papers),
     )

 # Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
 # All rights reserved.
 import asyncio
+import datetime
 import os
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Tuple
+import aiohttp
 import requests
 from aclanthology import (
     extract_venue_info,
 )
 from metrics import calculate_gini, calculate_gini_simpson
+from pdf import parse_pdf_to_artcile_dict
 def get_or_create_eventloop():
+    """
+    Get the current event loop or create a new one if there is no current event loop in the thread.
+    Returns:
+        The current event loop.
+    """
     try:
         return asyncio.get_event_loop()
     except RuntimeError as ex:
         the name of the author (if the ID is valid for an author), or "invalid"
         if the ID is not valid for either a paper or an author.
     """
     # First, check if it's a paper ID
     paper_response = requests.get(
+        f"https://api.semanticscholar.org/v1/paper/{semantic_scholar_id}",
+        timeout=5,
     )
     # If the response status code is 200, it means the ID is valid for a paper
     # Next, check if it's an author ID
     author_response = requests.get(
+        f"https://api.semanticscholar.org/v1/author/{semantic_scholar_id}",
+        timeout=5,
     )
     # If the response status code is 200, it means the ID is valid for an author
     return []
+def compute_stats_for_references(s2_ref_paper_keys, year):
+    """
+    Computes various statistics for a list of reference paper keys.
+    Every key is looked up through the Semantic Scholar graph API (title, year
+    and s2FieldsOfStudy) using a thread pool. References whose request does
+    not come back with HTTP 200 are reported on stdout and skipped.
+    Args:
+        s2_ref_paper_keys (list): A list of Semantic Scholar paper keys for the references.
+        year (int): The year of the citing paper; each citation age is this year minus the reference's year.
+    Returns:
+        tuple: A tuple containing the following statistics:
+            - num_references (int): The number of fetched references that have a year.
+            - fields_of_study_counts (dict): Count per "s2-fos-model" field of study, excluding fields containing "Computer Science".
+            - year_to_title_dict (dict): Maps a reference year to the title of one reference from that year.
+            - cfdi (float): The CFDI (Citation Field Diversity Index), computed with calculate_gini_simpson.
+            - cadi (float): The CADI (Citation Age Diversity Index), computed with calculate_gini.
+            - output_maoc (float): The mAoC (Mean Age of Citation) of the references.
+        If no citation age can be computed, returns a tuple of six None values.
+    """
+    # (year, title) of every fetched reference. The two values are kept
+    # together so that dropping missing entries cannot shift titles onto the
+    # wrong years.
+    reference_year_title_pairs = []
+    reference_fos_list = []
+    with ThreadPoolExecutor() as executor:
+        # Remember which key each future belongs to: as_completed yields in
+        # completion order, so this is the only way to name a failing key.
+        future_to_key = {
+            executor.submit(
+                send_s2_request,
+                f"https://api.semanticscholar.org/graph/v1/paper/{ref_paper_key}?fields=title,year,s2FieldsOfStudy",
+            ): ref_paper_key
+            for ref_paper_key in s2_ref_paper_keys
+        }
+        for future in as_completed(future_to_key):
+            r_ref = future.result()
+            if r_ref.status_code != 200:
+                print(
+                    f"Error retrieving reference {r_ref.status_code} for"
+                    f" paper {future_to_key[future]}"
+                )
+                continue
+            result_ref = r_ref.json()
+            reference_year_title_pairs.append(
+                (result_ref["year"], result_ref["title"])
+            )
+            # "or []" guards against a null s2FieldsOfStudy value
+            reference_fos_list.extend(
+                field["category"]
+                for field in result_ref["s2FieldsOfStudy"] or []
+                if field["source"] == "s2-fos-model"
+            )
+    # Keep only the years that are actually known
+    reference_year_list = [
+        year_ref
+        for year_ref, _ in reference_year_title_pairs
+        if year_ref is not None
+    ]
+    # Count references
+    num_references = len(reference_year_list)
+    # Count occurrences of every field outside of Computer Science
+    fields_of_study_counts = dict(
+        Counter(
+            field
+            for field in reference_fos_list
+            if "Computer Science" not in field
+        )
+    )
+    # Citation age list
+    aoc_list = [
+        year - year_ref
+        for year_ref in reference_year_list
+        if year_ref and year
+    ]
+    if not aoc_list:
+        return None, None, None, None, None, None
+    # Compute citation age
+    output_maoc = sum(aoc_list) / len(aoc_list)
+    cadi = calculate_gini(aoc_list)
+    # Create a dictionary of year to title from complete pairs only
+    year_to_title_dict = {
+        year_ref: title_ref
+        for year_ref, title_ref in reference_year_title_pairs
+        if year_ref is not None and title_ref is not None
+    }
+    # Compute CFDI
+    cfdi = calculate_gini_simpson(fields_of_study_counts)
+    # Return the results
+    return (
+        num_references,
+        fields_of_study_counts,
+        year_to_title_dict,
+        cfdi,
+        cadi,
+        output_maoc,
+    )
 def compute_stats_for_s2_paper(ssid_paper_id):
     """
     Computes statistics for a given paper ID using the Semantic Scholar API.
             title + "\n" + ", ".join([author["name"] for author in authors])
         )
+        (
+            num_references,
+            fields_of_study_counts,
+            year_to_title_dict,
+            cfdi,
+            cadi,
+            output_maoc,
+        ) = compute_stats_for_references(filtered_s2_ref_paper_keys, year)
         # Return the results
         return (
     return None
 def compute_stats_for_acl_author(url):
     """
     Computes statistics for an author's papers in the ACL anthology.
 def compute_stats_for_acl_venue(url):
+    """
+    Computes statistics for papers in a given ACL venue.
+    Args:
+        url (str): The URL of the ACL venue.
+    Returns:
+        dict: A dictionary containing statistics for the papers in the venue.
+    """
     if paper_info := extract_venue_info(url):
         loop = get_or_create_eventloop()
         tasks = [
     return None
+def compute_stats_for_multiple_s2_papers(
+    papers: List[dict], title: str
+) -> Tuple[str, int, dict, dict, float, float, float]:
+    """
+    Computes statistics for multiple S2 papers.
+    Args:
+        papers (List[dict]): A list of S2 papers.
+        title (str): The title of the papers.
+    Returns:
+        A tuple containing the following statistics:
+        - title (str): The title of the papers.
+        - num_references (int): The total number of references in all papers.
+        - top_fields (dict): A dictionary containing the top fields and their counts.
+        - oldest_paper_dict (dict): A dictionary containing the oldest paper for each year.
+        - cfdi (float): The average CFDI score for all papers.
+        - cadi (float): The average CADI score for all papers.
+        - output_maoc (float): The average output MAOC score for all papers.
+    """
     num_references = 0
     top_fields = {}
     oldest_paper_dict = {}
         num_references += results[1]
         for field, count in results[2].items():
             top_fields[field] = top_fields.get(field, 0) + count
+        for year, ref_title in results[3].items():
+            oldest_paper_dict[year] = ref_title
         cfdi += results[4]
         cadi += results[5]
         output_maoc += results[6]
         cadi / len(papers),
         output_maoc / len(papers),
     )
+async def send_s2_async_request(url):
+    """
+    Performs an asynchronous GET request and decodes the response body as JSON.
+    A fresh aiohttp client session is opened for the call and closed again
+    before returning.
+    Args:
+        url (str): The URL to request.
+    Returns:
+        The JSON-decoded body of the response.
+    """
+    async with aiohttp.ClientSession() as session, session.get(url) as response:
+        payload = await response.json()
+        return payload
+async def match_title_to_s2_paper(title, authors=None):
+    """
+    Matches a given paper title to Semantic Scholar to retrieve its S2 paper ID.
+    Args:
+        title (str): The title of the paper.
+        authors (List[str], optional): List of authors of the paper. Defaults to None.
+            Currently not used for matching; accepted so callers can already pass it.
+    Returns:
+        str or None: The S2 paper ID of the first search result, or None if the
+        search returned no results.
+    """
+    from urllib.parse import quote
+    # Percent-encode the title: characters such as "&", "#" or "?" would
+    # otherwise be read as URL delimiters and truncate or corrupt the query.
+    search_url = (
+        "https://api.semanticscholar.org/graph/v1/paper/search"
+        f"?query={quote(title, safe='')}"
+    )
+    # Send request
+    response = await send_s2_async_request(search_url)
+    # "or []" also covers a payload whose "data" entry is null
+    results = response.get("data") or []
+    if not results:
+        return None
+    # Results are ranked by relevance, so the first one is the best match
+    return results[0].get("paperId")
+async def compute_stats_for_pdf(pdf_file):
+    """
+    Computes citation statistics for a given PDF file.
+    The PDF is parsed into an article dictionary, every reference with a title
+    is matched to a Semantic Scholar paper ID, and the matched IDs are handed
+    to compute_stats_for_references together with the current calendar year.
+    Args:
+        pdf_file: Object whose ``name`` attribute is passed to the PDF parser as the PDF path.
+    Returns:
+        tuple: The title of the article followed by the values returned by
+        compute_stats_for_references.
+    """
+    parsed_article = parse_pdf_to_artcile_dict(pdf_file.name)
+    # Build one lookup per titled reference and run them concurrently
+    lookups = []
+    for reference in parsed_article["references"]:
+        if reference["title"]:
+            lookups.append(
+                match_title_to_s2_paper(
+                    reference["title"], reference["authors"]
+                )
+            )
+    matched_ids = await asyncio.gather(*lookups)
+    # Drop the references for which no Semantic Scholar paper was found
+    s2_paper_ids = [
+        matched_id for matched_id in matched_ids if matched_id is not None
+    ]
+    # NOTE(review): citation ages are measured against today's year, not the
+    # PDF's own publication year - confirm this is intended for older papers.
+    current_year = datetime.date.today().year
+    reference_stats = compute_stats_for_references(s2_paper_ids, current_year)
+    return (parsed_article["title"],) + reference_stats