当前位置：首页 > news >正文

文献分区及影响因子批量查询

news 2026/6/28 13:55:03

针对文献下载后的影响因子查询问题，手动逐个查询效率较低，而使用Zotero等工具配合插件操作又过于繁琐。为此，我们开发了一个Python封装模块，能够自动批量处理文件夹中的文献，快速查询并生成分析报告。

请注意，使用本模块前需提前获取Easy Scholar的密钥。

主要模块

我们封装了一个类，用于实现相应的功能。

import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import quote

import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import pandas as pd
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches
from tqdm import tqdm  # pip install tqdm


class JournalLiteratureStatistics:
    """End-to-end bibliometric helper for a folder of PDF papers.

    Pipeline: parse PDF -> detect DOI -> query Crossref and easyScholar ->
    aggregate statistics -> draw figures -> write a Word report. Excel export
    is done by the caller from the DataFrame returned by ``batch_analyze``;
    ``clean_text`` strips the control characters Excel cannot store.
    """

    # Compiled once because every PDF / every cell goes through them.
    _DOI_PATTERN = re.compile(r'\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b', re.I)
    _CONTROL_CHARS = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

    def __init__(self, papers_dir, secret_key, email="your_email@example.com", max_workers=5):
        """Store configuration and verify that the paper folder exists.

        Args:
            papers_dir: Folder that contains the PDF files.
            secret_key: easyScholar API secret key.
            email: Contact address embedded in the User-Agent sent to Crossref.
            max_workers: Number of worker threads used by ``batch_analyze``.

        Raises:
            FileNotFoundError: If ``papers_dir`` does not exist.
        """
        self.papers_dir = Path(papers_dir)
        self.secret_key = secret_key
        # Crossref "Polite Pool" headers
        self.headers = {
            "User-Agent": f"LiteratureStats/1.0 (mailto:{email})"
        }
        self.max_workers = max_workers

        if not self.papers_dir.exists():
            raise FileNotFoundError(f"目录不存在: {self.papers_dir}")

    # =========================================================
    # 0. Helpers
    # =========================================================
    @staticmethod
    def clean_text(val):
        """Strip ASCII control characters that Excel (XML) rejects.

        Removes 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F from strings; any
        non-string value is returned unchanged.
        """
        if isinstance(val, str):
            return JournalLiteratureStatistics._CONTROL_CHARS.sub('', val)
        return val

    @staticmethod
    def _first(values):
        """Return the first element of a list, or None when it is empty/missing."""
        return values[0] if values else None

    @staticmethod
    def _column(df: pd.DataFrame, name: str) -> pd.Series:
        """Return ``df[name]``, or an all-missing column when it is absent.

        Records are heterogeneous dicts, so a column only exists when at
        least one paper produced that key.
        """
        if name in df.columns:
            return df[name]
        return pd.Series([None] * len(df), index=df.index, dtype=object)

    # =========================================================
    # 1. PDF -> Title + DOI
    # =========================================================
    def extract_pdf_metadata(self, pdf_path: Path) -> dict:
        """Read the title and DOI from a PDF.

        The title comes from the document metadata; when it is missing,
        shorter than 5 characters or contains "Untitled", the longest of the
        first 15 non-empty lines on page 1 (between 11 and 199 characters)
        is used instead. The DOI is searched on the first page only.

        Returns:
            Dict with ``File``, ``Title`` and ``DOI`` (the latter two may be
            None) plus ``Error_PDF`` when the file could not be processed.
        """
        info = {"File": pdf_path.name, "Title": None, "DOI": None}
        try:
            with fitz.open(pdf_path) as doc:
                # Prefer the embedded metadata.
                meta = doc.metadata or {}
                info["Title"] = meta.get("title")

                # First-page text drives both the DOI search and the title fallback.
                text = doc[0].get_text("text") if len(doc) > 0 else ""

                m = self._DOI_PATTERN.search(text)
                if m:
                    info["DOI"] = m.group(1)

                # Title fallback.
                if not info["Title"] or len(info["Title"]) < 5 or "Untitled" in info["Title"]:
                    lines = [l.strip() for l in text.split("\n") if l.strip()]
                    potential_titles = [l for l in lines[:15] if 10 < len(l) < 200]
                    if potential_titles:
                        info["Title"] = max(potential_titles, key=len)
        except Exception as e:
            info["Error_PDF"] = str(e)
        return info

    # =========================================================
    # 2. DOI -> Crossref
    # =========================================================
    def query_crossref(self, doi: str) -> dict:
        """Look up journal name, ISSN and title for a DOI on Crossref.

        Returns:
            ``{}`` for an empty DOI; otherwise a dict with ``Journal``,
            ``ISSN`` and ``Crossref_Title`` (each may be None), or
            ``{"Error_Crossref": message}`` when the request fails.
        """
        if not doi:
            return {}
        # Percent-encode the DOI so reserved characters cannot alter the path.
        url = f"https://api.crossref.org/works/{quote(doi)}"
        try:
            r = requests.get(url, headers=self.headers, timeout=15)
            r.raise_for_status()
            msg = r.json().get("message", {})
            issn = msg.get("ISSN")
            return {
                # An empty list here must yield None, not an IndexError that
                # would discard the rest of the record.
                "Journal": self._first(msg.get("container-title")),
                "ISSN": ",".join(issn) if issn else None,
                "Crossref_Title": self._first(msg.get("title")),
            }
        except Exception as e:
            return {"Error_Crossref": str(e)}

    # =========================================================
    # 3. easyScholar -> IF / quartile
    # =========================================================
    def query_journal_rank(self, journal_name: str) -> dict:
        """Fetch impact factor and ranking fields for a journal from easyScholar.

        Returns:
            ``{}`` for an empty name; the ranking dict on success;
            ``{"Warning_Rank": ...}`` when no ranking data is present;
            ``{"Error_Rank": ...}`` on an API or transport error.
        """
        if not journal_name:
            return {}

        url = "https://www.easyscholar.cc/open/getPublicationRank"
        params = {
            "secretKey": self.secret_key,
            # requests percent-encodes query parameters itself; quoting here
            # as well would send a double-encoded name (e.g. "%2520").
            "publicationName": journal_name,
        }
        try:
            r = requests.get(url, params=params, timeout=15)
            r.raise_for_status()
            res = r.json()

            if res.get("code") != 200:
                return {"Error_Rank": res.get("msg", "Unknown EasyScholar Error")}

            # Any level may be null in the JSON payload; treat that as "no data".
            data = res.get("data") or {}
            official = (data.get("officialRank") or {}).get("all") or {}

            if not official:
                return {"Warning_Rank": "No ranking data found"}

            return {
                "SCI_IF": official.get("sciif"),
                "SCI_IF_5yr": official.get("sciif5"),
                "SCI_JCR": official.get("sci"),
                "CAS_Upgrade": official.get("sciUp"),
                "CAS_Warning": official.get("sciwarn"),
            }
        except Exception as e:
            return {"Error_Rank": str(e)}

    # =========================================================
    # Single-paper pipeline
    # =========================================================
    def process_single_paper(self, pdf_path):
        """Run extraction and both online lookups for one PDF.

        Returns:
            The merged record dict. ``Error`` is set when no DOI was found or
            Crossref returned no journal name.
        """
        # Step 1: local extraction.
        info = self.extract_pdf_metadata(pdf_path)

        # Step 2: online lookups.
        if info.get("DOI"):
            cr = self.query_crossref(info["DOI"])
            info.update(cr)

            # The Crossref title replaces the one extracted from the PDF.
            if cr.get("Crossref_Title"):
                info["Title"] = cr["Crossref_Title"]

            if cr.get("Journal"):
                rank = self.query_journal_rank(cr.get("Journal"))
                info.update(rank)
            else:
                info["Error"] = "Journal name missing from Crossref"
        else:
            info["Error"] = "DOI extraction failed"

        return info

    # =========================================================
    # 4. Batch analysis (multi-threaded)
    # =========================================================
    def batch_analyze(self) -> pd.DataFrame:
        """Process every PDF in ``papers_dir`` concurrently.

        Returns:
            One row per PDF, ordered by file path so repeated runs produce
            the same table regardless of which thread finishes first.
        """
        # Suffix comparison is case-insensitive so "*.PDF" is also picked up
        # on case-sensitive filesystems.
        pdf_files = sorted(
            p for p in self.papers_dir.iterdir()
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
        records = [None] * len(pdf_files)

        print(f"🚀 开始分析 {len(pdf_files)} 篇文献 (并发数: {self.max_workers})...")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(self.process_single_paper, pdf): idx
                for idx, pdf in enumerate(pdf_files)
            }

            for future in tqdm(as_completed(future_to_index), total=len(pdf_files), unit="paper"):
                idx = future_to_index[future]
                try:
                    records[idx] = future.result()
                except Exception as e:
                    records[idx] = {"File": pdf_files[idx].name, "Error": f"Crash: {str(e)}"}

        return pd.DataFrame(records)

    # =========================================================
    # 5. Summary statistics
    # =========================================================
    def summarize(self, df: pd.DataFrame) -> dict:
        """Count papers, detected DOIs/journals and top-tier rankings.

        Columns missing from ``df`` (e.g. when every lookup failed) are
        counted as zero instead of raising ``KeyError``.

        Returns:
            ``{}`` for an empty frame, otherwise a dict of plain ints.
        """
        if df.empty:
            return {}

        # NOTE(review): easyScholar may prefix the tier with a discipline
        # name; suffix matching covers both that and a bare "1区" — confirm
        # against real API responses.
        cas_tier1 = self._column(df, "CAS_Upgrade").map(
            lambda v: isinstance(v, str) and v.endswith("1区")
        )

        return {
            "Total papers": len(df),
            "DOI detected": int(self._column(df, "DOI").notna().sum()),
            "Journals identified": int(self._column(df, "Journal").notna().sum()),
            "SCI Q1": int((self._column(df, "SCI_JCR") == "Q1").sum()),
            "CAS 1st Quartile": int(cas_tier1.sum()),
            "CAS Warning": int(self._column(df, "CAS_Warning").notna().sum()),
        }

    # =========================================================
    # 6. Per-journal table
    # =========================================================
    def build_journal_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate papers per journal.

        Returns:
            Frame with ``Journal``, ``Count``, ``Mean_IF`` (rounded to two
            decimals), ``JCR`` and ``CAS``, sorted by ``Count`` descending;
            an empty frame when ``df`` is empty or has no ``Journal`` column.
        """
        if "Journal" not in df.columns or df.empty:
            return pd.DataFrame()

        df2 = df.copy()
        # Ranking columns are absent when no easyScholar lookup succeeded.
        for name in ("SCI_IF", "SCI_JCR", "CAS_Upgrade"):
            if name not in df2.columns:
                df2[name] = None
        df2["SCI_IF"] = pd.to_numeric(df2["SCI_IF"], errors="coerce")

        grouped = (
            df2.groupby("Journal")
            .agg(
                Count=("Journal", "count"),
                Mean_IF=("SCI_IF", "mean"),
                JCR=("SCI_JCR", "first"),
                CAS=("CAS_Upgrade", "first")
            )
            .sort_values("Count", ascending=False)
            .reset_index()
        )
        grouped["Mean_IF"] = grouped["Mean_IF"].round(2)
        return grouped

    # =========================================================
    # 7. Figures
    # =========================================================
    def plot_and_save_figures(self, df: pd.DataFrame, fig_dir):
        """Save the IF histogram and the JCR quartile pie chart as PNG files.

        Each figure is skipped when its column is missing or has no data.
        ``fig_dir`` is created if necessary.
        """
        fig_dir = Path(fig_dir)
        fig_dir.mkdir(parents=True, exist_ok=True)

        # Sans-serif font preference list (includes SimHei for CJK glyphs).
        plt.rcParams.update({'font.sans-serif': ['Arial', 'DejaVu Sans', 'SimHei'], 'font.size': 12})

        # Fig 1: impact-factor distribution.
        if "SCI_IF" in df.columns:
            if_series = pd.to_numeric(df["SCI_IF"], errors="coerce").dropna()
            if not if_series.empty:
                fig, ax = plt.subplots(figsize=(6, 4))
                ax.hist(if_series, bins=10, color='#4c72b0', edgecolor='black', alpha=0.8)
                ax.set_xlabel("SCI Impact Factor (IF)")
                ax.set_ylabel("Count")
                ax.set_title("Distribution of Impact Factors", pad=12)
                ax.grid(axis='y', linestyle='--', alpha=0.5)
                plt.tight_layout()
                fig.savefig(fig_dir / "Fig1_IF_distribution.png", dpi=300)
                plt.close(fig)

        # Fig 2: JCR quartiles.
        if "SCI_JCR" in df.columns:
            jcr_counts = df["SCI_JCR"].value_counts().sort_index()
            if not jcr_counts.empty:
                fig, ax = plt.subplots(figsize=(5, 5))
                colors = ['#5b9bd5', '#ed7d31', '#a5a5a5', '#ffc000']
                wedges, texts, autotexts = ax.pie(
                    jcr_counts, labels=jcr_counts.index, autopct="%1.1f%%",
                    startangle=90, colors=colors[:len(jcr_counts)]
                )
                plt.setp(autotexts, size=10, weight="bold", color="white")
                ax.set_title("JCR Quartile Distribution", pad=12)
                plt.tight_layout()
                fig.savefig(fig_dir / "Fig2_JCR_quartile.png", dpi=300)
                plt.close(fig)

    # =========================================================
    # 8. Word report
    # =========================================================
    def export_word_report(self, df, table_df, fig_dir, out_docx):
        """Write the summary, the journal table and the figures to a .docx file.

        Args:
            df: Per-paper results as returned by ``batch_analyze``.
            table_df: Per-journal table as returned by ``build_journal_table``.
            fig_dir: Folder holding the PNG files from ``plot_and_save_figures``;
                missing figures are skipped.
            out_docx: Destination passed to ``Document.save``. When it is a
                path, its parent folder is created first.
        """
        doc = Document()
        doc.add_heading("Bibliometric Analysis Report", level=0).alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Summary paragraph.
        s = self.summarize(df)
        if s:
            doc.add_heading("1. Summary", level=1)
            para = (
                f"This study analyzed {s['Total papers']} journal articles. "
                f"DOIs were extracted from {s['DOI detected']} files. "
                f"JCR Q1 papers: {s['SCI Q1']}. "
                f"CAS Tier 1 papers: {s['CAS 1st Quartile']}."
            )
            doc.add_paragraph(para)

        # Journal table.
        if not table_df.empty:
            doc.add_heading("2. Journal Distribution", level=1)
            table = doc.add_table(rows=len(table_df) + 1, cols=len(table_df.columns))
            table.style = 'Table Grid'

            # Header row.
            header_cells = table.rows[0].cells
            for j, col in enumerate(table_df.columns):
                header_cells[j].text = str(col)
                for run in header_cells[j].paragraphs[0].runs:
                    run.bold = True

            # Body rows: count positions explicitly so a non-default index on
            # table_df cannot shift or overflow the Word table rows.
            for i, row in enumerate(table_df.itertuples(index=False, name=None), start=1):
                cells = table.rows[i].cells
                for j, val in enumerate(row):
                    cells[j].text = str(val) if pd.notnull(val) else "-"

        # Figures.
        doc.add_heading("3. Statistical Figures", level=1)
        fig_dir = Path(fig_dir)

        def add_fig(name, caption):
            path = fig_dir / name
            # Pass a str: handing a Path object to add_picture caused a
            # "seek" error.
            if path.exists():
                doc.add_picture(str(path), width=Inches(5.0))
                doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                doc.add_paragraph(caption).alignment = WD_ALIGN_PARAGRAPH.CENTER

        add_fig("Fig1_IF_distribution.png", "Figure 1. IF Distribution")
        add_fig("Fig2_JCR_quartile.png", "Figure 2. JCR Quartile")

        if isinstance(out_docx, (str, Path)):
            # Saving into a folder that does not exist yet would fail.
            Path(out_docx).parent.mkdir(parents=True, exist_ok=True)
            doc.save(str(out_docx))
        else:
            doc.save(out_docx)
        print(f"✅ Word 报告已保存: {out_docx}")

调用方式

请将上面的模块保存为一个 Python 文件，文件名可以命名为 literature_manipulation.py，并将该模块与调用它的 Python 文件放在同一目录下。

import literature_manipulation
from pathlib import Path

#%%
# Configuration
PAPERS_DIR = ""                  # folder that contains the PDF files
SECRET_KEY = ""                  # easyScholar secret key
OUTPUT_DIR = "./output"          # folder for the generated results
EMAIL = "youremail@uni.edu"      # contact address sent to Crossref

# An empty PAPERS_DIR would resolve to the current directory and be scanned
# silently, and an empty key only fails later, once per paper; stop early.
if not PAPERS_DIR or not SECRET_KEY:
    raise ValueError("请先在配置区填写 PAPERS_DIR 和 SECRET_KEY")

# Instantiate the analysis tool.
tool = literature_manipulation.JournalLiteratureStatistics(
    papers_dir=PAPERS_DIR,
    secret_key=SECRET_KEY,
    email=EMAIL
)

try:
    # 1. Batch analysis.
    df_result = tool.batch_analyze()

    # 2. Strip characters Excel rejects (IllegalCharacterError).
    #    DataFrame.map exists from pandas 2.1; older versions use applymap.
    #    Picking the method up front keeps an AttributeError raised inside
    #    the cleaner from being mistaken for a missing method.
    apply_elementwise = getattr(df_result, "map", None) or df_result.applymap
    df_result = apply_elementwise(tool.clean_text)

    # 3. Export to Excel; parents=True allows a nested output folder.
    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    df_result.to_excel(f"{OUTPUT_DIR}/Literature_Stats_High_value.xlsx", index=False)
    print("✅ Excel 数据已保存")

    # 4. Figures and Word report.
    df_journals = tool.build_journal_table(df_result)
    tool.plot_and_save_figures(df_result, fig_dir=OUTPUT_DIR)
    tool.export_word_report(df_result, df_journals, fig_dir=OUTPUT_DIR, out_docx=f"{OUTPUT_DIR}/Report.docx")

except Exception as e:
    # Top-level boundary of the script: report the failure with its traceback.
    print(f"❌ 程序运行出错: {e}")
    import traceback
    traceback.print_exc()

指定 PDF 文件夹路径，填入 easyScholar 密钥，并填入用于 Crossref 礼貌访问（Polite Pool）的邮箱地址，运行即可生成相关报告和 Excel 表格。

查看全文

http://www.cnnetsun.cn/news/106768.html