当前位置: 首页 > news >正文

文献分区及影响因子批量查询

针对文献下载后的影响因子查询问题,手动逐个查询效率较低,而使用Zotero等工具配合插件操作又过于繁琐。为此,我们开发了一个Python封装模块,能够自动批量处理文件夹中的文献,快速查询并生成分析报告。

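请注意,使用本模块前需提前获取 easyScholar 的密钥(即下文配置中的 SECRET_KEY),并安装代码中导入的第三方库(requests、pandas、PyMuPDF、matplotlib、python-docx、tqdm;导出 Excel 还需要 openpyxl)。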

主要模块

我们封装了一个类,用于实现相应的功能。

import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import quote

import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import pandas as pd
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches
from tqdm import tqdm  # pip install tqdm


class JournalLiteratureStatistics:
    """End-to-end bibliometric helper for a folder of PDF papers.

    Pipeline: PDF parsing -> DOI detection -> Crossref / easyScholar lookup
    -> summary statistics and figures -> Word report.

    Also provides ``clean_text`` so callers can strip characters that Excel
    (XML) rejects before exporting the result table.
    """

    # DOI pattern, matched case-insensitively against the first page text.
    _DOI_RE = re.compile(r'\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b', re.I)
    # ASCII control characters that are illegal in XML-based Excel files:
    # 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F.
    _ILLEGAL_XML_CHARS_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

    def __init__(self, papers_dir, secret_key, email="your_email@example.com", max_workers=5):
        """Store the configuration and verify that the PDF folder exists.

        Args:
            papers_dir: Folder that contains the PDF files to analyse.
            secret_key: easyScholar secret key used for rank queries.
            email: Contact address embedded in the User-Agent sent to Crossref.
            max_workers: Number of worker threads used by ``batch_analyze``.

        Raises:
            FileNotFoundError: If ``papers_dir`` does not exist.
        """
        self.papers_dir = Path(papers_dir)
        self.secret_key = secret_key
        # Crossref "Polite Pool" headers
        self.headers = {
            "User-Agent": f"LiteratureStats/1.0 (mailto:{email})"
        }
        self.max_workers = max_workers

        if not self.papers_dir.exists():
            raise FileNotFoundError(f"目录不存在: {self.papers_dir}")

    # =========================================================
    # 0. Helper: strip characters that Excel cannot store
    # =========================================================
    @staticmethod
    def clean_text(val):
        """Return ``val`` without XML-illegal control characters.

        Non-string values are returned unchanged, so the function can be
        mapped over every cell of a DataFrame.
        """
        if isinstance(val, str):
            return JournalLiteratureStatistics._ILLEGAL_XML_CHARS_RE.sub('', val)
        return val

    # =========================================================
    # 1. PDF -> title + DOI
    # =========================================================
    def extract_pdf_metadata(self, pdf_path: Path) -> dict:
        """Extract a title and DOI from one PDF.

        The title comes from the PDF metadata when it looks usable, otherwise
        from the longest plausible line among the first 15 non-empty lines of
        page one. The DOI is searched in the text of page one only.

        Returns:
            A dict with keys ``File``, ``Title`` and ``DOI``; ``Error_PDF`` is
            added (and the other values may stay ``None``) if reading fails.
        """
        info = {"File": pdf_path.name, "Title": None, "DOI": None}
        try:
            # str() keeps this working with PyMuPDF builds that do not accept
            # pathlib objects.
            with fitz.open(str(pdf_path)) as doc:
                meta = doc.metadata or {}
                text = doc[0].get_text("text") if len(doc) > 0 else ""

            info["Title"] = (meta.get("title") or "").strip() or None

            match = self._DOI_RE.search(text)
            if match:
                info["DOI"] = match.group(1)

            # Fallback when the metadata title is missing or a placeholder.
            title = info["Title"]
            if not title or len(title) < 5 or "Untitled" in title:
                lines = [line.strip() for line in text.split("\n") if line.strip()]
                candidates = [line for line in lines[:15] if 10 < len(line) < 200]
                if candidates:
                    info["Title"] = max(candidates, key=len)
        except Exception as e:
            # Deliberate best effort: one unreadable PDF must not stop the batch.
            info["Error_PDF"] = str(e)
        return info

    # =========================================================
    # 2. DOI -> Crossref
    # =========================================================
    def query_crossref(self, doi: str) -> dict:
        """Look up journal name, ISSN and title for ``doi`` on Crossref.

        Returns:
            ``{}`` for an empty DOI; otherwise a dict with ``Journal``,
            ``ISSN`` and ``Crossref_Title`` (each possibly ``None``), or
            ``{"Error_Crossref": ...}`` if the request fails.
        """
        if not doi:
            return {}

        # Percent-encode everything except "/" so that DOIs containing
        # characters such as "#", "?" or "<" do not truncate or break the URL.
        url = f"https://api.crossref.org/works/{quote(doi.strip(), safe='/')}"
        try:
            r = requests.get(url, headers=self.headers, timeout=15)
            r.raise_for_status()
            msg = r.json().get("message") or {}

            # These fields are lists and may be present but empty; "or"
            # covers both the missing and the empty case without IndexError.
            journals = msg.get("container-title") or [None]
            titles = msg.get("title") or [None]
            issns = msg.get("ISSN") or []
            return {
                "Journal": journals[0],
                "ISSN": ",".join(issns) if issns else None,
                "Crossref_Title": titles[0],
            }
        except Exception as e:
            # Deliberate best effort: report the failure in the result row.
            return {"Error_Crossref": str(e)}

    # =========================================================
    # 3. easyScholar -> impact factor / quartile
    # =========================================================
    def query_journal_rank(self, journal_name: str) -> dict:
        """Query easyScholar for the ranking data of ``journal_name``.

        Returns:
            ``{}`` for an empty name; a dict with ``SCI_IF``, ``SCI_IF_5yr``,
            ``SCI_JCR``, ``CAS_Upgrade`` and ``CAS_Warning`` on success;
            ``{"Warning_Rank": ...}`` when no ranking data is returned; or
            ``{"Error_Rank": ...}`` on an API or network error.
        """
        if not journal_name:
            return {}

        url = "https://www.easyscholar.cc/open/getPublicationRank"
        # requests percent-encodes query parameters itself; quoting the name
        # here as well would double-encode it ("%20" -> "%2520").
        params = {
            "secretKey": self.secret_key,
            "publicationName": journal_name,
        }
        try:
            r = requests.get(url, params=params, timeout=15)
            r.raise_for_status()
            res = r.json()

            if res.get("code") != 200:
                return {"Error_Rank": res.get("msg", "Unknown EasyScholar Error")}

            # Any level may be missing or null; treat both as "no data".
            data = res.get("data") or {}
            official = (data.get("officialRank") or {}).get("all") or {}
            if not official:
                return {"Warning_Rank": "No ranking data found"}

            return {
                "SCI_IF": official.get("sciif"),
                "SCI_IF_5yr": official.get("sciif5"),
                "SCI_JCR": official.get("sci"),
                "CAS_Upgrade": official.get("sciUp"),
                "CAS_Warning": official.get("sciwarn"),
            }
        except Exception as e:
            # Deliberate best effort: report the failure in the result row.
            return {"Error_Rank": str(e)}

    # =========================================================
    # Single-paper pipeline
    # =========================================================
    def process_single_paper(self, pdf_path):
        """Run extraction and both online lookups for one PDF.

        Returns:
            The merged result dict for the paper. ``Error`` is set when no DOI
            was found or Crossref returned no journal name.
        """
        # Step 1: local extraction
        info = self.extract_pdf_metadata(pdf_path)

        # Step 2: online lookups
        if not info.get("DOI"):
            info["Error"] = "DOI extraction failed"
            return info

        cr = self.query_crossref(info["DOI"])
        info.update(cr)

        # Prefer the Crossref title over the one guessed from the PDF.
        if cr.get("Crossref_Title"):
            info["Title"] = cr["Crossref_Title"]

        if cr.get("Journal"):
            info.update(self.query_journal_rank(cr.get("Journal")))
        else:
            info["Error"] = "Journal name missing from Crossref"
        return info

    # =========================================================
    # 4. Batch analysis (multi-threaded)
    # =========================================================
    def batch_analyze(self) -> pd.DataFrame:
        """Process every PDF directly inside ``papers_dir`` concurrently.

        Files are matched by a case-insensitive ``.pdf`` suffix and the rows
        of the returned DataFrame follow the sorted file order, regardless of
        which worker finishes first.

        Returns:
            One row per PDF; an empty DataFrame if the folder has no PDFs.
        """
        pdf_files = sorted(
            p for p in self.papers_dir.iterdir()
            if p.is_file() and p.suffix.lower() == ".pdf"
        )

        print(f"🚀 开始分析 {len(pdf_files)} 篇文献 (并发数: {self.max_workers})...")

        records = [None] * len(pdf_files)
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(self.process_single_paper, pdf): idx
                for idx, pdf in enumerate(pdf_files)
            }
            for future in tqdm(as_completed(future_to_index), total=len(pdf_files), unit="paper"):
                idx = future_to_index[future]
                try:
                    records[idx] = future.result()
                except Exception as e:
                    # Keep a row for the file even if its worker crashed.
                    records[idx] = {"File": pdf_files[idx].name, "Error": f"Crash: {str(e)}"}

        return pd.DataFrame(records)

    # =========================================================
    # 5. Summary statistics
    # =========================================================
    def summarize(self, df: pd.DataFrame) -> dict:
        """Return headline counts for the result table.

        Columns that are absent (for example when no DOI was detected or every
        rank lookup failed) are counted as zero instead of raising KeyError.

        Returns:
            ``{}`` for an empty DataFrame, otherwise a dict of integer counts.
        """
        if df.empty:
            return {}

        def column(name):
            # Missing column -> all-null stand-in with the same index.
            if name in df.columns:
                return df[name]
            return pd.Series([None] * len(df), index=df.index, dtype="object")

        return {
            "Total papers": len(df),
            "DOI detected": int(column("DOI").notna().sum()),
            "Journals identified": int(column("Journal").notna().sum()),
            "SCI Q1": int((column("SCI_JCR") == "Q1").sum()),
            "CAS 1st Quartile": int((column("CAS_Upgrade") == "1区").sum()),
            "CAS Warning": int(column("CAS_Warning").notna().sum()),
        }

    # =========================================================
    # 6. Per-journal table
    # =========================================================
    def build_journal_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate the results per journal.

        Returns:
            A DataFrame with columns ``Journal``, ``Count``, ``Mean_IF``
            (rounded to 2 decimals), ``JCR`` and ``CAS``, sorted by ``Count``
            descending; an empty DataFrame if there is no ``Journal`` column.
        """
        if "Journal" not in df.columns or df.empty:
            return pd.DataFrame()

        df2 = df.copy()
        # Rank columns only exist if at least one easyScholar lookup
        # succeeded; add empty ones so the aggregation below cannot KeyError.
        for name in ("SCI_IF", "SCI_JCR", "CAS_Upgrade"):
            if name not in df2.columns:
                df2[name] = None
        df2["SCI_IF"] = pd.to_numeric(df2["SCI_IF"], errors="coerce")

        grouped = (
            df2.groupby("Journal")
            .agg(
                Count=("Journal", "count"),
                Mean_IF=("SCI_IF", "mean"),
                JCR=("SCI_JCR", "first"),
                CAS=("CAS_Upgrade", "first"),
            )
            .sort_values("Count", ascending=False)
            .reset_index()
        )
        grouped["Mean_IF"] = grouped["Mean_IF"].round(2)
        return grouped

    # =========================================================
    # 7. Figures
    # =========================================================
    def plot_and_save_figures(self, df: pd.DataFrame, fig_dir):
        """Save the impact-factor histogram and the JCR quartile pie chart.

        Writes ``Fig1_IF_distribution.png`` and ``Fig2_JCR_quartile.png`` into
        ``fig_dir`` (created if needed). A figure is skipped when its source
        column is missing or has no usable values.
        """
        fig_dir = Path(fig_dir)
        fig_dir.mkdir(parents=True, exist_ok=True)

        # Sans-serif font preference list; SimHei is included for CJK glyphs.
        plt.rcParams.update({'font.sans-serif': ['Arial', 'DejaVu Sans', 'SimHei'], 'font.size': 12})

        # Fig 1: impact-factor distribution
        if "SCI_IF" in df.columns:
            if_series = pd.to_numeric(df["SCI_IF"], errors="coerce").dropna()
            if not if_series.empty:
                fig, ax = plt.subplots(figsize=(6, 4))
                ax.hist(if_series, bins=10, color='#4c72b0', edgecolor='black', alpha=0.8)
                ax.set_xlabel("SCI Impact Factor (IF)")
                ax.set_ylabel("Count")
                ax.set_title("Distribution of Impact Factors", pad=12)
                ax.grid(axis='y', linestyle='--', alpha=0.5)
                fig.tight_layout()
                fig.savefig(fig_dir / "Fig1_IF_distribution.png", dpi=300)
                plt.close(fig)

        # Fig 2: JCR quartiles
        if "SCI_JCR" in df.columns:
            jcr_counts = df["SCI_JCR"].value_counts().sort_index()
            if not jcr_counts.empty:
                fig, ax = plt.subplots(figsize=(5, 5))
                colors = ['#5b9bd5', '#ed7d31', '#a5a5a5', '#ffc000']
                _wedges, _texts, autotexts = ax.pie(
                    jcr_counts,
                    labels=jcr_counts.index,
                    autopct="%1.1f%%",
                    startangle=90,
                    colors=colors[:len(jcr_counts)],
                )
                plt.setp(autotexts, size=10, weight="bold", color="white")
                ax.set_title("JCR Quartile Distribution", pad=12)
                fig.tight_layout()
                fig.savefig(fig_dir / "Fig2_JCR_quartile.png", dpi=300)
                plt.close(fig)

    # =========================================================
    # 8. Word report
    # =========================================================
    def export_word_report(self, df, table_df, fig_dir, out_docx):
        """Write the Word report: summary, journal table and figures.

        Args:
            df: Per-paper result table (as returned by ``batch_analyze``).
            table_df: Per-journal table (as returned by ``build_journal_table``).
            fig_dir: Folder that holds the PNG files written by
                ``plot_and_save_figures``; missing figures are skipped.
            out_docx: Destination path of the ``.docx`` file.
        """
        doc = Document()
        doc.add_heading("Bibliometric Analysis Report", level=0).alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Summary paragraph
        s = self.summarize(df)
        if s:
            doc.add_heading("1. Summary", level=1)
            para = (
                f"This study analyzed {s['Total papers']} journal articles. "
                f"DOIs were extracted from {s['DOI detected']} files. "
                f"JCR Q1 papers: {s['SCI Q1']}. "
                f"CAS Tier 1 papers: {s['CAS 1st Quartile']}."
            )
            doc.add_paragraph(para)

        # Journal table
        if not table_df.empty:
            doc.add_heading("2. Journal Distribution", level=1)
            table = doc.add_table(rows=len(table_df) + 1, cols=len(table_df.columns))
            table.style = 'Table Grid'

            # Header row; looping over runs is safe even for an empty label.
            for j, col in enumerate(table_df.columns):
                header_cell = table.rows[0].cells[j]
                header_cell.text = str(col)
                for run in header_cell.paragraphs[0].runs:
                    run.bold = True

            # Body rows are addressed by position, so the DataFrame index
            # does not have to be 0..n-1.
            for i, row in enumerate(table_df.itertuples(index=False, name=None), start=1):
                cells = table.rows[i].cells
                for j, val in enumerate(row):
                    cells[j].text = str(val) if pd.notnull(val) else "-"

        # Figures
        doc.add_heading("3. Statistical Figures", level=1)
        fig_dir = Path(fig_dir)

        def add_fig(name, caption):
            path = fig_dir / name
            if path.exists():
                # python-docx is given str paths: passing a Path object was
                # reported to fail with a "seek" error.
                doc.add_picture(str(path), width=Inches(5.0))
                doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                doc.add_paragraph(caption).alignment = WD_ALIGN_PARAGRAPH.CENTER

        add_fig("Fig1_IF_distribution.png", "Figure 1. IF Distribution")
        add_fig("Fig2_JCR_quartile.png", "Figure 2. JCR Quartile")

        Path(out_docx).parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(out_docx))
        print(f"✅ Word 报告已保存: {out_docx}")

调用方式

请将上面的模块保存为一个 Python 文件,文件名需为 literature_manipulation.py(与下方调用脚本中的 import 语句一致),并将其与调用脚本放在同一目录下。

import traceback
from pathlib import Path

import literature_manipulation

#%%
# Configuration
PAPERS_DIR = ""  # folder that contains the PDF files
SECRET_KEY = ""  # easyScholar secret key
OUTPUT_DIR = "./output"  # folder that receives the Excel file, figures and report
EMAIL = "youremail@uni.edu"  # contact address sent to Crossref

# Fail fast on unfilled placeholders: an empty PAPERS_DIR would otherwise
# resolve to the current directory, and an empty key makes every
# easyScholar query fail.
if not PAPERS_DIR or not SECRET_KEY:
    raise SystemExit("❌ 请先在配置区填写 PAPERS_DIR 和 SECRET_KEY")

# Instantiate the analysis tool
tool = literature_manipulation.JournalLiteratureStatistics(
    papers_dir=PAPERS_DIR,
    secret_key=SECRET_KEY,
    email=EMAIL
)

try:
    # Create the output folder (including parents) before the long-running
    # analysis, so a bad output path is reported immediately.
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Batch analysis
    df_result = tool.batch_analyze()

    # 2. Clean the data (avoids IllegalCharacterError on Excel export).
    # DataFrame.map exists from pandas 2.1 on; older versions only have
    # applymap. Checking the class avoids masking unrelated AttributeErrors.
    if hasattr(type(df_result), "map"):
        df_result = df_result.map(tool.clean_text)
    else:
        df_result = df_result.applymap(tool.clean_text)

    # 3. Export to Excel
    df_result.to_excel(output_dir / "Literature_Stats_High_value.xlsx", index=False)
    print("✅ Excel 数据已保存")

    # 4. Build the figures and the Word report
    df_journals = tool.build_journal_table(df_result)
    tool.plot_and_save_figures(df_result, fig_dir=OUTPUT_DIR)
    tool.export_word_report(df_result, df_journals, fig_dir=OUTPUT_DIR, out_docx=f"{OUTPUT_DIR}/Report.docx")

except Exception as e:
    # Top-level boundary of the script: report the error with its traceback.
    print(f"❌ 程序运行出错: {e}")
    traceback.print_exc()

指定 PDF 文件夹路径,填入 easyScholar 密钥,并填入用于 Crossref 礼貌访问(Polite Pool)的联系邮箱,运行后即可生成 Word 分析报告、统计图和 Excel 表格。

http://www.cnnetsun.cn/news/106768.html

相关文章:

  • APKMirror安卓应用下载平台深度解析:从源码到实践
  • 终极FreeMarker模板调试工具:3分钟解决模板语法问题
  • QQScreenShot独立版技术解析:基于模块化架构的屏幕捕捉解决方案
  • 快速掌握SCPI Parser终极指南:构建专业仪器控制系统的完整解决方案
  • 自定义算子的“诞生记”:基于CANN Kernel自调工程的完整CI/CD流水线
  • 高效、稳定、可定制——EmotiVoice开源TTS优势全解析
  • 大模型应用开发(十八)_向量检索
  • NVIDIA显卡设置终极指南:从问题诊断到性能优化的完整解决方案
  • 聚星成链,蓝卓牵头成立“工厂操作系统生态联盟”共建产业新生态
  • 每天一道面试题之架构篇|可靠订单状态机与事务消息架构设计
  • 10分钟掌握开源美颜SDK核心技术:从算法原理到商业应用实战
  • EmotiVoice支持哪些语言?多语种语音合成能力测试报告
  • AI语音合成进入情感时代:EmotiVoice带来全新听觉体验
  • EmotiVoice支持WebAssembly吗?浏览器端运行可能性分析
  • StaMPS雷达数据处理:从零搭建专业位移监测系统
  • yt-dlp-gui终极指南:轻松掌握Windows视频下载利器
  • EmotiVoice是否支持语音情感随机扰动?增强自然感功能
  • QRemeshify终极指南:快速创建高质量四边形网格的完整教程
  • 如何免费获得高质量语音合成能力?EmotiVoice给你答案
  • Hive SQL中COALESCE 函数和NVL()函数、IFNULL函数区别
  • 四边形网格生成实战指南:掌握QuadriFlow高效工作流
  • 如何快速解决AMD GPU识别问题:终极故障排查指南
  • OpenProject企业版深度解析:从开源到商业化的全面升级
  • Next.js认证系统实战:基于Clerk的完整解决方案
  • DeepBench如何帮助你在5分钟内完成深度学习硬件性能精准评估?
  • PCB文件处理终极指南:用Python轻松解析Gerber和Excellon文件
  • 革命性API测试工具:WireMock UI让接口模拟变得前所未有的简单
  • EmotiVoice能否用于智能家居控制反馈?轻量级语音提示生成
  • Lime编辑器极速上手:从零到精通的避坑指南
  • Wan2.2模型AI视频生成实战指南:从设备配置到创意实现