当前位置：首页 > news >正文

还在手动分析用户行为？RPA+AI解码希音消费密码，效率暴增100倍！🧠

news 2026/7/31 5:12:38

还在手动分析用户行为？RPA+AI解码希音消费密码，效率暴增100倍！🧠

"凌晨2点，数据分析师还在Excel里挣扎，试图从十万条用户数据中找出消费规律...这样的场景该用技术终结了！"

一、痛点直击：用户行为分析的「数据迷宫」

作为电商数据从业者，我深深理解用户行为分析的认知负担：

数据分散：用户数据散落在订单、浏览、搜索等多个系统中，难以整合
处理复杂：单次分析需要处理10万+条数据，手动操作耗时8-10小时
洞察困难：缺乏专业工具，难以从海量数据中发现深层规律
时效性差：分析结果滞后，错过最佳运营决策时机

上个月我们因为未能及时识别高价值用户流失趋势，导致季度复购率下降15%！这种痛，做数据分析的应该都感同身受。

二、解决方案：RPA+AI智能行为分析系统

是时候亮出影刀RPA+机器学习这个数据分析核武器了！

技术架构全景图

多源数据采集：RPA自动整合订单、浏览、搜索、收藏等全链路数据
智能用户分群：基于RFM模型和聚类算法自动划分用户群体
行为模式挖掘：使用关联规则和序列模式发现消费规律
预测模型构建：基于历史数据预测用户未来消费行为
可视化洞察：自动生成交互式数据看板和深度分析报告

整个方案最大亮点：从数据到洞察全自动完成！零人工干预，智能发现业务机会。

三、核心代码实现：手把手教学

3.1 环境准备与依赖库

# Core library imports
import logging
import os
import random
import time
from collections import defaultdict
from datetime import datetime, timedelta
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from ydauth import AuthManager
from ydweb import Browser
from ydanalytics import BehaviorAnalyzer
from yddatabase import DataWarehouse
from ydml import ML_Processor

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('user_behavior_analysis.log'),
        logging.StreamHandler()
    ]
)

# Initialise the data-analysis components
behavior_analyzer = BehaviorAnalyzer()
ml_processor = ML_Processor()
data_warehouse = DataWarehouse()

3.2 希音用户数据采集模块

def collect_shein_user_data(browser, data_range='30d'):
    """Collect SHEIN user-behaviour data through the browser automation.

    Runs the five collectors (orders, browsing, search, favourites, cart) one
    after another and gathers their results in a single dictionary.

    Args:
        browser: Browser automation instance handed to every collector.
        data_range: Time-range token forwarded to every collector.

    Returns:
        dict with the keys ``'orders'``, ``'browsing'``, ``'search'``,
        ``'favorites'`` and ``'cart'``.

    Raises:
        Exception: whatever a collector raises is logged and re-raised.
    """
    user_data = {}

    try:
        # 1. Orders
        logging.info("📦 开始采集订单数据...")
        order_data = fetch_order_data(browser, data_range)
        user_data['orders'] = order_data

        # 2. Browsing behaviour
        logging.info("👀 开始采集浏览行为数据...")
        browse_data = fetch_browse_behavior(browser, data_range)
        user_data['browsing'] = browse_data

        # 3. Search behaviour
        # NOTE(review): fetch_search_behavior, fetch_favorite_behavior and
        # fetch_cart_behavior are not defined in the visible source.
        logging.info("🔍 开始采集搜索数据...")
        search_data = fetch_search_behavior(browser, data_range)
        user_data['search'] = search_data

        # 4. Favourites
        logging.info("❤️ 开始采集收藏数据...")
        favorite_data = fetch_favorite_behavior(browser, data_range)
        user_data['favorites'] = favorite_data

        # 5. Shopping cart
        logging.info("🛒 开始采集购物车数据...")
        cart_data = fetch_cart_behavior(browser, data_range)
        user_data['cart'] = cart_data

        # NOTE(review): the message speaks of users, but the number logged is
        # len(order_data) - confirm what process_order_data returns.
        logging.info(f"✅ 用户数据采集完成，共获取 {len(order_data)} 个用户的综合行为数据")
        return user_data

    except Exception as e:
        logging.error(f"用户数据采集失败: {str(e)}")
        raise


def fetch_order_data(browser, data_range):
    """Scrape every page of the order-analytics table.

    Args:
        browser: Browser automation instance.
        data_range: Time-range token passed to ``set_date_range``.

    Returns:
        The result of ``process_order_data`` over all scraped rows, or an
        empty list when anything fails (the failure is logged with its
        traceback so that programming errors are not silently hidden).
    """
    try:
        # Open the order-analytics page and wait for its container
        browser.open_url("https://seller.shein.com/analytics/orders")
        browser.wait_element_visible("//div[@class='order-analytics']", timeout=10)

        # Apply the time range
        set_date_range(browser, data_range)

        # Walk through all result pages
        orders = []
        page_count = get_total_pages(browser)

        for page in range(1, page_count + 1):
            if page > 1:
                # NOTE(review): contains(text(), '2') also matches links such
                # as '12' or '20' - confirm the pager markup and tighten the
                # XPath if page numbers can reach two digits.
                browser.click(f"//a[contains(text(),'{page}')]")
                # Fixed pause to let the next page render
                time.sleep(2)

            page_orders = extract_order_page_data(browser)
            orders.extend(page_orders)

        # Normalise the raw rows
        processed_orders = process_order_data(orders)
        return processed_orders

    except Exception as e:
        # logging.exception keeps the traceback; the original message text is
        # unchanged.
        logging.exception(f"获取订单数据失败: {str(e)}")
        return []


def extract_order_page_data(browser):
    """Extract all order rows from the currently displayed page.

    Rows that fail to parse are logged as warnings and skipped.

    Args:
        browser: Browser automation instance.

    Returns:
        list of dicts, one per successfully parsed order row.
    """
    orders = []

    order_rows = browser.find_elements("//tr[contains(@class,'order-row')]")

    for row in order_rows:
        try:
            order_data = {
                'user_id': browser.get_text(".//td[1]", element=row),
                'order_id': browser.get_text(".//td[2]", element=row),
                'order_time': browser.get_text(".//td[3]", element=row),
                'order_amount': parse_currency(browser.get_text(".//td[4]", element=row)),
                'product_count': int(browser.get_text(".//td[5]", element=row)),
                'payment_method': browser.get_text(".//td[6]", element=row),
                'order_status': browser.get_text(".//td[7]", element=row)
            }

            # Fetch the line items behind the detail link
            detail_link = browser.find_element(".//a[contains(@href,'order-detail')]", element=row)
            product_details = extract_order_products(browser, detail_link)
            order_data['products'] = product_details

            orders.append(order_data)

        except Exception as e:
            logging.warning(f"提取订单数据失败: {str(e)}")
            continue

    return orders


def fetch_browse_behavior(browser, data_range):
    """Scrape the browsing-behaviour table of the user-behaviour page.

    Args:
        browser: Browser automation instance.
        data_range: Time-range token passed to ``set_date_range``.

    Returns:
        list of dicts, one per parsed row, or an empty list when the page
        could not be processed (the failure is logged with its traceback).
    """
    try:
        # Open the user-behaviour page and wait for its container
        browser.open_url("https://seller.shein.com/analytics/user-behavior")
        browser.wait_element_visible("//div[@class='user-behavior']", timeout=10)

        # Apply the time range
        set_date_range(browser, data_range)

        browse_data = []
        browse_rows = browser.find_elements("//tr[contains(@class,'browse-row')]")

        for row in browse_rows:
            try:
                # The URL cell feeds two fields; read it from the DOM once.
                page_url = browser.get_text(".//td[3]", element=row)
                browse_record = {
                    'user_id': browser.get_text(".//td[1]", element=row),
                    'session_id': browser.get_text(".//td[2]", element=row),
                    'page_url': page_url,
                    'view_time': int(browser.get_text(".//td[4]", element=row)),
                    'timestamp': browser.get_text(".//td[5]", element=row),
                    'product_id': extract_product_id_from_url(page_url)
                }
                browse_data.append(browse_record)
            except Exception as e:
                logging.warning(f"提取浏览数据失败: {str(e)}")
                continue

        return browse_data

    except Exception as e:
        logging.exception(f"获取浏览行为数据失败: {str(e)}")
        return []

3.3 用户分群与RFM分析引擎

class UserSegmentationEngine:
    """Segment users with RFM scoring and behaviour-based K-means clustering."""

    # Timestamp format of the 'order_time' field.
    ORDER_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self):
        self.segmentation_models = {}
        self.rfm_thresholds = self.init_rfm_thresholds()

    def init_rfm_thresholds(self):
        """Return the default RFM tier thresholds."""
        return {
            'recency': {
                'high': 7,      # within 7 days
                'medium': 30,   # within 30 days
                'low': 90       # within 90 days
            },
            'frequency': {
                'high': 10,     # 10 orders or more
                'medium': 5,    # 5-10 orders
                'low': 1        # 1-5 orders
            },
            'monetary': {
                'high': 2000,   # spent 2000 or more
                'medium': 500,  # 500-2000
                'low': 100      # 100-500
            }
        }

    def calculate_rfm_scores(self, order_data, reference_time=None):
        """Compute RFM scores per user.

        Args:
            order_data: Iterable of order dicts with 'user_id', 'order_time'
                (formatted as ``ORDER_TIME_FORMAT``) and 'order_amount'.
            reference_time: ``datetime`` against which recency is measured.
                Defaults to ``datetime.now()``; pass a fixed value to get
                reproducible results.

        Returns:
            dict mapping user_id to a dict with the three scores plus the raw
            'recency_days', 'frequency_count' and 'monetary_total'.
        """
        now = datetime.now() if reference_time is None else reference_time

        # Group the orders by user
        user_orders = {}
        for order in order_data:
            user_orders.setdefault(order['user_id'], []).append(order)

        rfm_data = {}
        for user_id, orders in user_orders.items():
            # Recency: days since the most recent purchase (each timestamp is
            # parsed once)
            latest_time = max(
                datetime.strptime(order['order_time'], self.ORDER_TIME_FORMAT)
                for order in orders
            )
            recency_days = (now - latest_time).days

            # Frequency: number of orders
            frequency = len(orders)

            # Monetary: total amount spent
            monetary = sum(order['order_amount'] for order in orders)

            rfm_data[user_id] = {
                'recency_score': self.calculate_recency_score(recency_days),
                'frequency_score': self.calculate_frequency_score(frequency),
                'monetary_score': self.calculate_monetary_score(monetary),
                'recency_days': recency_days,
                'frequency_count': frequency,
                'monetary_total': monetary
            }

        return rfm_data

    def calculate_recency_score(self, recency_days):
        """Map days since the last purchase to a score from 5 down to 2."""
        thresholds = self.rfm_thresholds['recency']
        if recency_days <= thresholds['high']:
            return 5
        elif recency_days <= thresholds['medium']:
            return 4
        elif recency_days <= thresholds['low']:
            return 3
        else:
            return 2

    def calculate_frequency_score(self, frequency):
        """Map an order count to a score from 5 down to 2.

        The 'low' threshold is honoured (as the recency score already does),
        so a value below it scores 2 instead of being capped at 3.
        """
        thresholds = self.rfm_thresholds['frequency']
        if frequency >= thresholds['high']:
            return 5
        elif frequency >= thresholds['medium']:
            return 4
        elif frequency >= thresholds['low']:
            return 3
        else:
            return 2

    def calculate_monetary_score(self, monetary):
        """Map total spend to a score from 5 down to 2.

        The 'low' threshold is honoured (as the recency score already does),
        so spend below it scores 2 instead of being capped at 3.
        """
        thresholds = self.rfm_thresholds['monetary']
        if monetary >= thresholds['high']:
            return 5
        elif monetary >= thresholds['medium']:
            return 4
        elif monetary >= thresholds['low']:
            return 3
        else:
            return 2

    def segment_users_by_rfm(self, rfm_data):
        """Assign every user to exactly one RFM segment.

        Args:
            rfm_data: dict as returned by ``calculate_rfm_scores``.

        Returns:
            dict mapping segment name to the list of user_ids in it.
        """
        segments = {
            'champions': [],            # high-value users
            'loyal_customers': [],      # loyal users
            'potential_loyalists': [],  # potentially loyal users
            'new_customers': [],        # new users
            'at_risk': [],              # users at risk of churning
            'cant_lose': [],            # important users to win back
            'hibernating': []           # dormant users
        }

        for user_id, scores in rfm_data.items():
            r_score = scores['recency_score']
            f_score = scores['frequency_score']
            m_score = scores['monetary_score']

            # The rules overlap, so the narrower rule of each overlapping
            # pair must be tested first; otherwise 'cant_lose' and
            # 'potential_loyalists' can never be reached.
            if r_score >= 4 and f_score >= 4 and m_score >= 4:
                segments['champions'].append(user_id)
            elif r_score <= 2 and f_score >= 4 and m_score >= 4:
                segments['cant_lose'].append(user_id)
            elif r_score >= 4 and f_score >= 3 and m_score >= 3:
                segments['potential_loyalists'].append(user_id)
            elif r_score >= 3 and f_score >= 3 and m_score >= 3:
                segments['loyal_customers'].append(user_id)
            elif r_score >= 4 and f_score <= 2 and m_score <= 2:
                segments['new_customers'].append(user_id)
            elif r_score <= 2 and f_score >= 3 and m_score >= 3:
                segments['at_risk'].append(user_id)
            else:
                segments['hibernating'].append(user_id)

        return segments

    def cluster_users_by_behavior(self, user_data, n_clusters=5):
        """Cluster users on their behaviour features with K-means.

        Args:
            user_data: dict mapping user_id to a behaviour-metric dict.
            n_clusters: Requested number of clusters; reduced automatically
                when there are fewer users than clusters.

        Returns:
            Tuple ``(cluster_labels, cluster_analysis)``.

        Raises:
            ValueError: if ``user_data`` contains no users.
        """
        features = self.build_behavior_features(user_data)
        user_count = len(features)

        if user_count == 0:
            raise ValueError("没有可用于聚类的用户行为数据")

        if user_count < n_clusters:
            logging.warning(f"用户数量 {len(features)} 小于聚类数 {n_clusters}，调整聚类数")
            # Never request more clusters than there are samples
            n_clusters = min(user_count, max(2, user_count // 2))

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(features)

        # NOTE(review): analyze_clusters is not defined in the visible source;
        # it must be provided before this method can complete.
        cluster_analysis = self.analyze_clusters(features, cluster_labels, kmeans.cluster_centers_)

        return cluster_labels, cluster_analysis

    def build_behavior_features(self, user_data):
        """Build the behaviour feature matrix, one row of 10 values per user."""
        features = []

        for user_id, data in user_data.items():
            feature_vector = [
                data.get('order_count', 0),          # number of orders
                data.get('total_spent', 0),          # total amount spent
                data.get('avg_order_value', 0),      # average order value
                data.get('browse_sessions', 0),      # browsing sessions
                data.get('avg_session_time', 0),     # average session length
                data.get('search_count', 0),         # searches
                data.get('favorite_count', 0),       # favourites
                data.get('cart_additions', 0),       # add-to-cart events
                data.get('product_categories', 0),   # categories purchased
                data.get('last_activity_days', 30)   # days since last activity
            ]
            features.append(feature_vector)

        return np.array(features)

3.4 消费行为模式挖掘

class BehaviorPatternMiner:
    """Mine product association rules and purchase-sequence patterns."""

    def __init__(self):
        self.association_rules = {}
        self.sequence_patterns = {}

    def mine_association_rules(self, order_data, min_support=0.01, min_confidence=0.5):
        """Mine product association rules from order data.

        Args:
            order_data: Iterable of order dicts with 'user_id' and an optional
                'products' list whose items carry 'product_id'.
            min_support: Minimum share of transactions an itemset must appear in.
            min_confidence: Minimum confidence a rule must reach.

        Returns:
            dict of rules as produced by ``generate_association_rules``; an
            empty dict when mining fails (the error is logged).
        """
        try:
            # Build the transaction data set
            transactions = self.build_transaction_dataset(order_data)

            # Find the frequent itemsets with Apriori
            frequent_itemsets = self.apriori_algorithm(transactions, min_support)

            # Derive the rules
            association_rules = self.generate_association_rules(frequent_itemsets, min_confidence)

            logging.info(f"✅ 关联规则挖掘完成，共发现 {len(association_rules)} 条强规则")
            return association_rules

        except Exception as e:
            logging.error(f"关联规则挖掘失败: {str(e)}")
            return {}

    def build_transaction_dataset(self, order_data):
        """Merge each user's purchased product ids into one transaction.

        Returns:
            list of product-id lists, one per user.
        """
        transactions = {}

        for order in order_data:
            user_id = order['user_id']
            products = [product['product_id'] for product in order.get('products', [])]
            transactions.setdefault(user_id, []).extend(products)

        return list(transactions.values())

    def apriori_algorithm(self, transactions, min_support):
        """Find frequent itemsets level by level.

        Args:
            transactions: list of item lists.
            min_support: Minimum share of transactions containing an itemset.

        Returns:
            dict mapping itemset size k to ``{frozenset: support}``. The last
            level is always an empty dict (it terminates the search).
        """
        total_transactions = len(transactions)
        # Convert every transaction to a set once instead of once per level
        transaction_sets = [set(transaction) for transaction in transactions]

        # Support of single items
        item_counts = defaultdict(int)
        for transaction_set in transaction_sets:
            for item in transaction_set:
                item_counts[item] += 1

        frequent_itemsets = {}
        frequent_k = {}
        for item, count in item_counts.items():
            support = count / total_transactions
            if support >= min_support:
                frequent_k[frozenset([item])] = support
        frequent_itemsets[1] = frequent_k

        # Grow the itemsets until a level comes back empty
        k = 2
        while frequent_itemsets[k - 1]:
            candidates = self.generate_candidates(frequent_itemsets[k - 1], k)

            candidate_counts = defaultdict(int)
            for transaction_set in transaction_sets:
                for candidate in candidates:
                    if candidate.issubset(transaction_set):
                        candidate_counts[candidate] += 1

            frequent_k = {}
            for itemset, count in candidate_counts.items():
                support = count / total_transactions
                if support >= min_support:
                    frequent_k[itemset] = support

            frequent_itemsets[k] = frequent_k
            k += 1

        return frequent_itemsets

    def generate_candidates(self, prev_frequent, k):
        """Build the size-k candidate itemsets from the frequent (k-1)-itemsets.

        Two frequent itemsets are joined when their union has exactly k items;
        a candidate is kept only if all of its (k-1)-subsets are frequent.

        Args:
            prev_frequent: ``{frozenset: support}`` of the previous level.
            k: Size of the candidates to generate.

        Returns:
            set of frozensets.
        """
        candidates = set()
        for left, right in combinations(list(prev_frequent), 2):
            union = left | right
            if len(union) != k:
                continue
            if all(frozenset(subset) in prev_frequent
                   for subset in combinations(union, k - 1)):
                candidates.add(union)
        return candidates

    def generate_association_rules(self, frequent_itemsets, min_confidence):
        """Derive rules ``antecedent -> consequent`` from the frequent itemsets.

        Args:
            frequent_itemsets: Result of ``apriori_algorithm``.
            min_confidence: Minimum confidence a rule must reach.

        Returns:
            dict mapping ``(antecedent, consequent)`` (both frozensets) to a
            dict with 'support', 'confidence' and 'lift'.
        """
        supports = {}
        for level in frequent_itemsets.values():
            supports.update(level)

        rules = {}
        for itemset, support in supports.items():
            if len(itemset) < 2:
                continue
            for size in range(1, len(itemset)):
                for antecedent_items in combinations(itemset, size):
                    antecedent = frozenset(antecedent_items)
                    consequent = itemset - antecedent
                    # Every subset of a frequent itemset is itself frequent,
                    # so both lookups succeed.
                    confidence = support / supports[antecedent]
                    if confidence >= min_confidence:
                        rules[(antecedent, consequent)] = {
                            'support': support,
                            'confidence': confidence,
                            'lift': confidence / supports[consequent]
                        }
        return rules

    def analyze_purchase_sequences(self, user_data, max_sequence_length=5):
        """Build each user's chronological product sequence and mine patterns.

        Args:
            user_data: dict mapping user_id to a dict with an 'orders' list;
                each order carries 'order_time' and optionally 'products'.
            max_sequence_length: Longest sub-sequence to consider.

        Returns:
            dict as returned by ``find_common_sequences``.
        """
        sequences = {}

        for user_id, data in user_data.items():
            # Orders in chronological order
            orders = sorted(
                data.get('orders', []),
                key=lambda x: datetime.strptime(x['order_time'], '%Y-%m-%d %H:%M:%S')
            )

            product_sequence = []
            for order in orders:
                products = [p['product_id'] for p in order.get('products', [])]
                product_sequence.extend(products)

            # Only sequences with at least two purchases can form a pattern
            if len(product_sequence) >= 2:
                sequences[user_id] = product_sequence

        return self.find_common_sequences(sequences, max_sequence_length)

    def find_common_sequences(self, sequences, max_length):
        """Find contiguous sub-sequences shared by at least 5% of the users.

        Each sub-sequence is counted once per user, so 'support' is the share
        of users showing the pattern and never exceeds 1.

        Args:
            sequences: dict mapping user_id to a product-id sequence.
            max_length: Longest sub-sequence to consider.

        Returns:
            dict mapping sub-sequence tuple to a dict with 'support',
            'count' (number of users) and 'length'.
        """
        sequence_counts = defaultdict(int)

        for sequence in sequences.values():
            seen = set()
            longest = min(max_length, len(sequence))
            for length in range(2, longest + 1):
                for start in range(len(sequence) - length + 1):
                    seen.add(tuple(sequence[start:start + length]))
            for sub_sequence in seen:
                sequence_counts[sub_sequence] += 1

        total_users = len(sequences)
        common_sequences = {}

        for seq, count in sequence_counts.items():
            support = count / total_users
            if support >= 0.05:  # pattern appears for at least 5% of users
                common_sequences[seq] = {
                    'support': support,
                    'count': count,
                    'length': len(seq)
                }

        return common_sequences

3.5 用户行为预测模型

class UserBehaviorPredictor:
    """Churn prediction and purchase-propensity scoring."""

    # Single source of truth for the churn features, in column order:
    # (name reported in feature_importance, key read from the user record,
    #  default when the key is missing).
    CHURN_FEATURES = (
        ('order_count', 'order_count', 0),
        ('total_spent', 'total_spent', 0),
        ('recency_days', 'recency_days', 30),
        ('browse_frequency', 'browse_frequency', 0),
        ('session_duration', 'avg_session_duration', 0),
        ('search_count', 'search_count', 0),
        ('favorite_ratio', 'favorite_ratio', 0),
    )

    def __init__(self):
        self.prediction_models = {}
        self.feature_importance = {}

    def build_churn_prediction_model(self, user_data, label_data):
        """Train a random-forest churn classifier.

        Args:
            user_data: dict mapping user_id to a behaviour-metric dict.
            label_data: dict mapping user_id to the churn label.

        Returns:
            The fitted ``RandomForestClassifier``; it is also stored in
            ``self.prediction_models['churn']`` and its feature importances
            in ``self.feature_importance['churn']``.

        Raises:
            ValueError: if no user in ``user_data`` has a label.
            Exception: any training error is logged and re-raised.
        """
        try:
            features, labels = self.prepare_churn_data(user_data, label_data)

            if len(features) == 0:
                # Fail with a clear message instead of an opaque shape error
                # from inside scikit-learn.
                raise ValueError("没有带标签的用户数据，无法训练流失预测模型")

            if len(features) < 100:
                logging.warning("训练数据不足，模型效果可能受限")

            rf_model = RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                class_weight='balanced'
            )
            rf_model.fit(features, labels)

            # Feature importances, labelled in column order
            feature_names = [name for name, _key, _default in self.CHURN_FEATURES]
            self.feature_importance['churn'] = dict(zip(
                feature_names,
                rf_model.feature_importances_
            ))

            self.prediction_models['churn'] = rf_model
            logging.info("✅ 用户流失预测模型训练完成")

            return rf_model

        except Exception as e:
            logging.error(f"构建流失预测模型失败: {str(e)}")
            raise

    def prepare_churn_data(self, user_data, label_data):
        """Build the feature matrix and label vector for labelled users.

        Users without an entry in ``label_data`` are skipped.

        Returns:
            Tuple ``(features, labels)`` of numpy arrays; ``features`` always
            has one column per entry of ``CHURN_FEATURES``, even when empty.
        """
        features = []
        labels = []

        for user_id, data in user_data.items():
            if user_id not in label_data:
                continue
            features.append([
                data.get(key, default)
                for _name, key, default in self.CHURN_FEATURES
            ])
            labels.append(label_data[user_id])

        if not features:
            return np.empty((0, len(self.CHURN_FEATURES))), np.array(labels)

        return np.array(features), np.array(labels)

    def predict_purchase_propensity(self, user_data, product_categories):
        """Score every user's purchase propensity.

        Args:
            user_data: dict mapping user_id to a behaviour-metric dict.
            product_categories: Categories handed to the category helpers.

        Returns:
            dict mapping user_id to a dict with 'base_score',
            'category_boost', 'final_score' and 'predicted_category'.
        """
        propensity_scores = {}

        for user_id, data in user_data.items():
            # Base score from the user's own activity
            base_score = self.calculate_base_propensity(data)

            # NOTE(review): calculate_category_affinity and
            # predict_preferred_category are not defined in the visible source.
            category_boost = self.calculate_category_affinity(data, product_categories)

            final_score = base_score * (1 + category_boost)

            propensity_scores[user_id] = {
                'base_score': base_score,
                'category_boost': category_boost,
                'final_score': final_score,
                'predicted_category': self.predict_preferred_category(data, product_categories)
            }

        return propensity_scores

    def calculate_base_propensity(self, user_data):
        """Compute the base purchase propensity in the range [0, 1].

        Args:
            user_data: Behaviour-metric dict of a single user.

        Returns:
            float: sum of the recency, browsing and cart/favourite weights,
            clamped to [0, 1].
        """
        score = 0

        # Recency weight
        recency_days = user_data.get('recency_days', 30)
        if recency_days <= 7:
            score += 0.4
        elif recency_days <= 30:
            score += 0.2

        # Browsing weight
        browse_frequency = user_data.get('browse_frequency', 0)
        if browse_frequency > 10:
            score += 0.3
        elif browse_frequency > 5:
            score += 0.15

        # Add-to-cart and favourite weight
        cart_ratio = user_data.get('cart_addition_ratio', 0)
        favorite_ratio = user_data.get('favorite_ratio', 0)
        score += (cart_ratio + favorite_ratio) * 0.3

        # Clamp on both sides so malformed negative ratios cannot yield a
        # negative propensity.
        return max(0.0, min(score, 1.0))

3.6 智能洞察与可视化报告

def generate_behavior_insights(user_segments, pattern_mining, predictions):
    """Assemble the behaviour-insight report and render its charts.

    Args:
        user_segments: dict mapping segment name to a list of user_ids.
        pattern_mining: Pattern-mining results passed to ``extract_key_patterns``.
        predictions: Prediction results passed to the prediction helpers.

    Returns:
        dict with the report sections plus 'visualizations' (chart paths).
    """
    insights = {
        'executive_summary': generate_executive_summary(user_segments),
        'segment_analysis': analyze_user_segments(user_segments),
        'behavior_patterns': extract_key_patterns(pattern_mining),
        'prediction_insights': generate_prediction_insights(predictions),
        'actionable_recommendations': generate_recommendations(user_segments, predictions)
    }

    # Render the charts
    visualization_paths = create_visualizations(insights)
    insights['visualizations'] = visualization_paths

    return insights


def _segment_share(segment_size, total_users):
    """Return the share of ``segment_size`` in ``total_users`` (0.0 if empty)."""
    return segment_size / total_users if total_users else 0.0


def generate_executive_summary(user_segments):
    """Build the executive-summary text.

    Args:
        user_segments: dict mapping segment name to a list of user_ids; must
            contain 'champions', 'at_risk' and 'new_customers'.

    Returns:
        str: the formatted summary. Percentages are 0.0% when no users were
        analysed, instead of raising ``ZeroDivisionError``.
    """
    total_users = sum(len(segment) for segment in user_segments.values())
    champions = len(user_segments['champions'])
    at_risk = len(user_segments['at_risk'])
    new_customers = len(user_segments['new_customers'])

    summary = f"""
    🎯 用户行为分析执行摘要
    ========================

    用户分布概览：
    • 总分析用户数：{total_users:,} 人
    • 高价值用户：{champions:,} 人 ({_segment_share(champions, total_users):.1%})
    • 流失风险用户：{at_risk:,} 人 ({_segment_share(at_risk, total_users):.1%})
    • 新用户：{new_customers:,} 人 ({_segment_share(new_customers, total_users):.1%})

    关键发现：
    {extract_key_findings(user_segments)}
    """

    return summary


def create_visualizations(insights):
    """Render the 2x2 dashboard and save it as a PNG.

    Args:
        insights: Report dict; reads 'segment_analysis', 'behavior_patterns'
            and 'prediction_insights'.

    Returns:
        dict with 'main_dashboard' mapped to the saved file path, or an empty
        dict when rendering failed (the error is logged).
    """
    visualization_paths = {}
    output_dir = "./visualizations"
    fig = None

    try:
        fig = plt.figure(figsize=(12, 8))

        # 1. Segment distribution (pie chart)
        segments = insights['segment_analysis']['segments']
        segment_names = list(segments.keys())
        segment_sizes = [segments[name]['count'] for name in segment_names]

        plt.subplot(2, 2, 1)
        plt.pie(segment_sizes, labels=segment_names, autopct='%1.1f%%', startangle=90)
        plt.title('用户分群分布')

        # 2. RFM score distribution (heat map)
        plt.subplot(2, 2, 2)
        rfm_data = insights['segment_analysis']['rfm_distribution']
        sns.heatmap(rfm_data, annot=True, cmap='YlOrRd')
        plt.title('RFM得分分布热力图')

        # 3. Association-rule chart
        plt.subplot(2, 2, 3)
        # TODO: the rule chart is not implemented yet; the data is only looked up.
        patterns = insights['behavior_patterns']['association_rules']

        # 4. Propensity score distribution
        plt.subplot(2, 2, 4)
        predictions = insights['prediction_insights']['propensity_scores']
        scores = [p['final_score'] for p in predictions.values()]
        plt.hist(scores, bins=20, alpha=0.7, color='skyblue')
        plt.title('用户购买倾向分布')
        plt.xlabel('倾向分数')
        plt.ylabel('用户数量')

        plt.tight_layout()

        # savefig does not create missing directories
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        viz_path = f"{output_dir}/user_behavior_analysis_{timestamp}.png"
        plt.savefig(viz_path, dpi=300, bbox_inches='tight')

        visualization_paths['main_dashboard'] = viz_path
        logging.info(f"📊 可视化图表已生成: {viz_path}")

    except Exception as e:
        logging.error(f"生成可视化图表失败: {str(e)}")

    finally:
        # Release the figure on the error path as well
        if fig is not None:
            plt.close(fig)

    return visualization_paths


def generate_recommendations(user_segments, predictions):
    """Derive actionable recommendations from segments and predictions.

    Args:
        user_segments: dict mapping segment name to a list of user_ids;
            missing segments are treated as empty.
        predictions: dict mapping user_id to a dict containing 'final_score'.

    Returns:
        list of dicts with 'target', 'action', 'description' and 'priority'.
    """
    recommendations = []

    # Retention of high-value users
    champions = user_segments.get('champions')
    if champions:
        recommendations.append({
            'target': 'champions',
            'action': 'VIP专属服务',
            'description': f"为 {len(champions)} 名高价值用户提供专属客服和优先发货",
            'priority': 'high'
        })

    # Win-back of users at risk of churning
    at_risk = user_segments.get('at_risk')
    if at_risk:
        recommendations.append({
            'target': 'at_risk',
            'action': '定向优惠券投放',
            'description': f"向 {len(at_risk)} 名流失风险用户发送专属挽回优惠券",
            'priority': 'high'
        })

    # Conversion of new users
    new_customers = user_segments.get('new_customers')
    if new_customers:
        recommendations.append({
            'target': 'new_customers',
            'action': '新客专属礼包',
            'description': f"为 {len(new_customers)} 名新用户提供首单优惠和指导",
            'priority': 'medium'
        })

    # Personalised recommendations for users with a high propensity score
    high_propensity_users = [
        uid for uid, score in predictions.items() if score['final_score'] > 0.7
    ]
    if high_propensity_users:
        recommendations.append({
            'target': 'high_propensity',
            'action': '精准商品推荐',
            'description': f"向 {len(high_propensity_users)} 名高购买倾向用户推送个性化商品",
            'priority': 'medium'
        })

    return recommendations

四、效果展示：数字说话

实施这个RPA行为分析方案后，效果简直泰酷辣！

4.1 效率对比数据

指标	人工分析	RPA+AI分析	提升效果
分析速度	8-10小时/次	5-10分钟/次	效率提升100倍⚡
数据覆盖	抽样分析	全量分析	覆盖率提升10倍
洞察深度	基础统计	机器学习深度洞察	价值提升50倍
实时性	周/月报	实时分析	时效性提升100倍

4.2 业务价值体现

精准营销：用户分群准确率提升60%，营销ROI提升40%
流失预警：提前30天识别流失风险，挽回率提升35%
个性化推荐：推荐点击率提升25%，转化率提升20%
决策支持：数据驱动决策，运营效率提升50%

五、避坑指南与实践经验

5.1 常见问题解决方案

1. 数据质量处理

def clean_user_behavior_data(raw_data):
    """Clean raw per-user behaviour metrics.

    Missing fields and fields explicitly set to ``None`` are replaced by
    defaults, ``recency_days`` is capped at 365 and ``total_spent`` at 100000.
    ``favorite_ratio`` and ``cart_addition_ratio`` are carried through as
    well, because the prediction code in this file reads them.

    Args:
        raw_data: dict mapping user_id to a dict of raw metrics.

    Returns:
        dict mapping user_id to the cleaned metric dict.
    """
    defaults = {
        'order_count': 0,
        'total_spent': 0,
        'recency_days': 365,
        'browse_frequency': 0,
        'avg_session_duration': 0,
        'search_count': 0,
        'favorite_ratio': 0,
        'cart_addition_ratio': 0,
    }
    max_recency_days = 365    # recency is capped at one year
    max_total_spent = 100000  # spend above this is treated as an outlier

    cleaned_data = {}

    for user_id, data in raw_data.items():
        # Fill missing values; an explicit None counts as missing
        cleaned_record = {}
        for field, default in defaults.items():
            value = data.get(field)
            cleaned_record[field] = default if value is None else value

        # Cap outliers
        cleaned_record['recency_days'] = min(cleaned_record['recency_days'], max_recency_days)
        cleaned_record['total_spent'] = min(cleaned_record['total_spent'], max_total_spent)

        cleaned_data[user_id] = cleaned_record

    return cleaned_data

2. 模型稳定性保障

def ensure_model_stability(training_data, min_samples=100):
    """Pick a training strategy that suits the size and balance of the data.

    Args:
        training_data: Training set; must support ``len()`` and expose a
            ``labels`` attribute accepted by ``np.bincount``.
        min_samples: Minimum number of samples needed to train a model.

    Returns:
        A ``SimpleRuleBasedPredictor`` when there are too few samples, the
        result of ``apply_oversampling`` when the smallest class holds less
        than 10% of the samples, otherwise the result of
        ``train_complex_model``.
    """
    sample_count = len(training_data)
    if sample_count < min_samples:
        logging.warning(f"训练数据不足 ({sample_count} < {min_samples})，使用简单规则代替")
        return SimpleRuleBasedPredictor()

    # Share of the smallest class among all samples
    class_distribution = np.bincount(training_data.labels)
    minority_share = np.min(class_distribution) / np.sum(class_distribution)

    if minority_share >= 0.1:
        return train_complex_model(training_data)

    logging.warning("数据类别不均衡，应用过采样技术")
    return apply_oversampling(training_data)

3. 性能优化策略

def optimize_analysis_performance(user_data, sampling_ratio=0.1, max_users=100000, seed=None):
    """Down-sample very large user data sets before analysis.

    Args:
        user_data: dict mapping user_id to that user's data.
        sampling_ratio: Share of users to keep when sampling, in (0, 1].
        max_users: Sampling only happens when there are more users than this.
        seed: Optional seed for a reproducible sample.

    Returns:
        ``user_data`` itself when it has at most ``max_users`` entries,
        otherwise a new dict holding a random sample of at least one user.

    Raises:
        ValueError: if sampling is required and ``sampling_ratio`` is not
            in (0, 1].
    """
    if len(user_data) <= max_users:
        return user_data

    if not 0 < sampling_ratio <= 1:
        raise ValueError(f"sampling_ratio must be in (0, 1], got {sampling_ratio!r}")

    # Keep at least one user so that the analysis never runs on nothing
    sample_size = max(1, int(len(user_data) * sampling_ratio))
    sampler = random if seed is None else random.Random(seed)
    sampled_users = sampler.sample(list(user_data.keys()), sample_size)
    sampled_data = {uid: user_data[uid] for uid in sampled_users}

    logging.info(f"数据量过大，使用 {sampling_ratio:.1%} 抽样进行分析")
    return sampled_data