vault_audit_tools/commands/entity_churn.rs

//! Multi-day entity churn analysis with intelligent ephemeral pattern detection.
//!
//! ⚠️ **DEPRECATED**: Use `entity-analysis churn` instead.
//!
//! ```bash
//! # Old (deprecated):
//! vault-audit entity-churn day1.log day2.log day3.log
//!
//! # New (recommended):
//! vault-audit entity-analysis churn day1.log day2.log day3.log
//! ```
//!
//! See [`entity_analysis`](crate::commands::entity_analysis) for the unified command.
//!
//! ---
//!
//! Tracks entity lifecycle across multiple audit log files (compressed or uncompressed)
//! to identify:
//! - New entities appearing each day
//! - Returning vs. churned entities
//! - Entity persistence patterns
//! - Authentication method usage trends
//! - **Ephemeral entities** using data-driven pattern learning
//!
//! # Usage
//!
//! ```bash
//! # Analyze entity churn across a week (compressed files)
//! vault-audit entity-churn day1.log.gz day2.log.gz day3.log.gz day4.log.gz day5.log.gz day6.log.gz day7.log.gz
//!
//! # With baseline for accurate new entity detection
//! vault-audit entity-churn *.log --baseline baseline_entities.json
//!
//! # With entity mappings for enriched display names
//! vault-audit entity-churn *.log --baseline baseline.json --entity-map entity_mappings.json
//!
//! # Export detailed churn data with ephemeral analysis
//! vault-audit entity-churn *.log --output entity_churn.json
//!
//! # Export as CSV format
//! vault-audit entity-churn *.log --output entity_churn.csv --format csv
//! ```
//!
//! **Compressed File Support**: Automatically handles `.gz` and `.zst` files - no manual
//! decompression required. Mix compressed and uncompressed files freely.
//!
//! # Ephemeral Pattern Detection
//!
//! The command uses a two-pass analysis to detect ephemeral entities
//! (e.g., CI/CD pipeline entities, temporary build entities) with confidence scoring:
//!
//! **Pass 1: Data Collection**
//! - Track all entities across log files
//! - Record first/last seen times and files
//! - Count login activity per entity
//!
//! **Pass 2: Pattern Learning & Classification**
//! - Learn patterns from entities that appeared on only 1-2 days
//! - Identify naming patterns (e.g., `github-repo:org/repo:ref:branch`)
//! - Calculate confidence scores (0.0-1.0), illustrated in the worked example below, based on:
//!   - Days active (1 day = high confidence, 2 days = medium)
//!   - Similar entities on the same mount path
//!   - Activity levels (low login counts)
//!   - Gaps in activity (reduces confidence for sporadic access)
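//!
//! For example, using the weights applied by `EphemeralPatternAnalyzer::analyze_entity`
//! below, a hypothetical entity seen in only one log file, with 2 logins and more than
//! five similar short-lived entities on the same mount, would score roughly
//! 0.5 (single day) + 0.2 (shared pattern) + 0.1 (low activity) = 0.8, well above the
//! 0.4 classification threshold.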
//!
//! # Output
//!
//! ## Entity Lifecycle Classification:
//! - **new_day_N**: Entities first seen on day N (not in baseline)
//! - **pre_existing_baseline**: Entities that existed before the analysis period
//!
//! ## Activity Patterns:
//! - **consistent**: Appeared in most/all log files
//! - **sporadic**: Appeared intermittently with gaps
//! - **declining**: Activity decreased over time
//! - **single_burst**: Appeared in only one or two files
//!
//! ## Ephemeral Detection:
//! - Confidence levels: High (≥70%), Medium (50-69%), Low (40-49%)
//! - Detailed reasoning for each classification
//! - Top ephemeral entities by confidence
//! - Pattern statistics and mount path analysis
//!
//! # JSON Output Fields
//!
//! When using `--output`, each entity record includes the following fields (see the illustrative record below):
//! - `entity_id`: Vault entity identifier
//! - `display_name`: Human-readable name
//! - `first_seen_file` / `first_seen_time`: When first observed
//! - `last_seen_file` / `last_seen_time`: When last observed
//! - `files_appeared`: List of log files the entity was active in
//! - `total_logins`: Total login count across all files
//! - `lifecycle`: Entity lifecycle classification
//! - `activity_pattern`: Behavioral pattern classification
//! - `is_ephemeral_pattern`: Boolean flag for ephemeral detection
//! - `ephemeral_confidence`: Confidence score (0.0-1.0)
//! - `ephemeral_reasons`: Array of human-readable reasons
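//!
//! A single exported record might look roughly like this (illustrative values only;
//! optional `baseline_*` / `historical_*` fields are omitted when absent):
//!
//! ```json
//! {
//!   "entity_id": "5d4a8c1e-0000-0000-0000-000000000000",
//!   "display_name": "github-repo:example-org/example-repo:ref:main",
//!   "mount_path": "auth/github/login",
//!   "mount_type": "jwt",
//!   "token_type": "service",
//!   "first_seen_file": "day_1.log",
//!   "first_seen_time": "2024-01-01T06:12:03Z",
//!   "last_seen_file": "day_1.log",
//!   "last_seen_time": "2024-01-01T06:15:47Z",
//!   "files_appeared": ["day_1.log"],
//!   "total_logins": 2,
//!   "lifecycle": "new_day_1",
//!   "activity_pattern": "single_burst",
//!   "is_ephemeral_pattern": true,
//!   "ephemeral_confidence": 0.8,
//!   "ephemeral_reasons": ["Appeared only 1 day (day_1.log)"]
//! }
//! ```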
//!
//! Only tracks entities that performed login operations (paths ending in `/login`).

use crate::audit::types::AuditEntry;
use crate::utils::progress::ProgressBar;
use crate::utils::reader::open_file;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Entity mapping loaded from the `--entity-map` JSON file (historical audit-log data)
#[derive(Debug, Serialize, Deserialize)]
struct EntityMapping {
    display_name: String,
    mount_path: String,
    #[allow(dead_code)]
    mount_accessor: String,
    #[allow(dead_code)]
    login_count: usize,
    #[allow(dead_code)]
    first_seen: String,
    #[allow(dead_code)]
    last_seen: String,
}

/// Represents an entity's churn status
#[derive(Debug, Serialize, Clone)]
struct EntityChurnRecord {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    first_seen_time: DateTime<Utc>,
    last_seen_file: String,
    last_seen_time: DateTime<Utc>,
    files_appeared: Vec<String>,
    total_logins: usize,
    lifecycle: String, // "new_day_N", "pre_existing_baseline", or "pre_existing_or_new_day_1"
    activity_pattern: String, // "consistent", "sporadic", "declining", "single_burst", "unknown"
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32, // 0.0 to 1.0
    ephemeral_reasons: Vec<String>,
    // Baseline metadata (if entity existed in baseline)
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_entity_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_created: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_alias_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_mount_path: Option<String>,
    // Entity-map metadata (from historical audit logs via preprocess-entities)
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_display_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_first_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_last_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_login_count: Option<usize>,
}

/// CSV-compatible representation of entity churn record
#[derive(Debug, Serialize)]
struct EntityChurnRecordCsv {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    first_seen_time: String,
    last_seen_file: String,
    last_seen_time: String,
    files_appeared: String, // Comma-separated list
    days_active: usize,
    total_logins: usize,
    lifecycle: String,
    activity_pattern: String,
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32,
    ephemeral_reasons: String, // Semicolon-separated list
    baseline_entity_name: String,
    baseline_created: String,
    baseline_alias_name: String,
    baseline_mount_path: String,
    historical_display_name: String,
    historical_first_seen: String,
    historical_last_seen: String,
    historical_login_count: String,
}

impl From<EntityChurnRecord> for EntityChurnRecordCsv {
    fn from(record: EntityChurnRecord) -> Self {
        EntityChurnRecordCsv {
            entity_id: record.entity_id,
            display_name: record.display_name,
            mount_path: record.mount_path,
            mount_type: record.mount_type,
            token_type: record.token_type,
            first_seen_file: record.first_seen_file,
            first_seen_time: record.first_seen_time.to_rfc3339(),
            last_seen_file: record.last_seen_file,
            last_seen_time: record.last_seen_time.to_rfc3339(),
            files_appeared: record.files_appeared.join(", "),
            days_active: record.files_appeared.len(),
            total_logins: record.total_logins,
            lifecycle: record.lifecycle,
            activity_pattern: record.activity_pattern,
            is_ephemeral_pattern: record.is_ephemeral_pattern,
            ephemeral_confidence: record.ephemeral_confidence,
            ephemeral_reasons: record.ephemeral_reasons.join("; "),
            baseline_entity_name: record.baseline_entity_name.unwrap_or_default(),
            baseline_created: record.baseline_created.unwrap_or_default(),
            baseline_alias_name: record.baseline_alias_name.unwrap_or_default(),
            baseline_mount_path: record.baseline_mount_path.unwrap_or_default(),
            historical_display_name: record.historical_display_name.unwrap_or_default(),
            historical_first_seen: record.historical_first_seen.unwrap_or_default(),
            historical_last_seen: record.historical_last_seen.unwrap_or_default(),
            historical_login_count: record
                .historical_login_count
                .map(|n| n.to_string())
                .unwrap_or_default(),
        }
    }
}

#[derive(Debug)]
struct DailyStats {
    #[allow(dead_code)]
    file_name: String,
    new_entities: usize,
    returning_entities: usize,
    total_logins: usize,
}

/// Analyzes entity behavior patterns to detect ephemeral entities
#[derive(Debug)]
struct EphemeralPatternAnalyzer {
    total_files: usize,
    short_lived_patterns: Vec<ShortLivedPattern>,
}

#[derive(Debug)]
struct ShortLivedPattern {
    days_active: usize,
    display_name: String,
    mount_path: String,
}

impl EphemeralPatternAnalyzer {
    fn new(total_files: usize) -> Self {
        Self {
            total_files,
            short_lived_patterns: Vec::new(),
        }
    }

    /// Learn patterns from entities that appeared 1-2 days (potential ephemeral patterns)
    fn learn_from_entities(&mut self, entities: &HashMap<String, EntityChurnRecord>) {
        for entity in entities.values() {
            let days_active = entity.files_appeared.len();

            // Learn from entities that appeared 1-2 days only
            if days_active <= 2 {
                self.short_lived_patterns.push(ShortLivedPattern {
                    days_active,
                    display_name: entity.display_name.clone(),
                    mount_path: entity.mount_path.clone(),
                });
            }
        }
    }

    /// Analyze an entity and determine if it matches ephemeral patterns
    fn analyze_entity(&self, entity: &EntityChurnRecord) -> (bool, f32, Vec<String>) {
        let days_active = entity.files_appeared.len();
        let mut confidence = 0.0;
        let mut reasons = Vec::new();

        // Strong indicators (high confidence)
        if days_active == 1 {
            confidence += 0.5;
            reasons.push(format!("Appeared only 1 day ({})", entity.first_seen_file));
        } else if days_active == 2 {
            confidence += 0.3;
            reasons.push(format!(
                "Appeared only 2 days: {}, {}",
                entity.files_appeared.first().unwrap_or(&String::new()),
                entity.files_appeared.last().unwrap_or(&String::new())
            ));
        }

        // Pattern matching: Check if display name follows patterns seen in other short-lived entities
        if days_active <= 2 {
            // Count how many other short-lived entities share similar patterns
            let similar_count = self
                .short_lived_patterns
                .iter()
                .filter(|p| {
                    // Same mount path
                    if p.mount_path == entity.mount_path && p.days_active <= 2 {
                        return true;
                    }
                    // Similar naming pattern (e.g., github-repo:* or airflow-*)
                    if entity.display_name.contains(':') && p.display_name.contains(':') {
                        let entity_prefix = entity.display_name.split(':').next().unwrap_or("");
                        let pattern_prefix = p.display_name.split(':').next().unwrap_or("");
                        if entity_prefix == pattern_prefix && !entity_prefix.is_empty() {
                            return true;
                        }
                    }
                    false
                })
                .count();

            if similar_count > 5 {
                confidence += 0.2;
                reasons.push(format!(
                    "Matches pattern seen in {} other short-lived entities",
                    similar_count
                ));
            } else if similar_count > 0 {
                confidence += 0.1;
                reasons.push(format!(
                    "Similar to {} other short-lived entities",
                    similar_count
                ));
            }
        }

        // Low activity indicator
        if entity.total_logins <= 5 && days_active <= 2 {
            confidence += 0.1;
            reasons.push(format!(
                "Low activity: only {} login(s)",
                entity.total_logins
            ));
        }

        // Non-continuous appearance (sporadic pattern suggests not churned, just periodic).
        // Day indices are parsed from file names such as `day_3.log`; compressed
        // extensions are stripped first so `.log.gz` / `.log.zst` inputs still parse.
        if days_active >= 2 {
            let first_day_idx = entity.files_appeared.first().and_then(|f| {
                f.split('_')
                    .next_back()
                    .map(|s| s.trim_end_matches(".gz").trim_end_matches(".zst"))
                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
            });
            let last_day_idx = entity.files_appeared.last().and_then(|f| {
                f.split('_')
                    .next_back()
                    .map(|s| s.trim_end_matches(".gz").trim_end_matches(".zst"))
                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
            });

            if let (Some(first), Some(last)) = (first_day_idx, last_day_idx) {
                // saturating_sub guards against files supplied out of numeric order
                let span = last.saturating_sub(first) + 1;
                if span > days_active {
                    // Gaps in activity - reduce confidence
                    confidence *= 0.7;
                    reasons.push(
                        "Has gaps in activity (possibly sporadic access, not churned)".to_string(),
                    );
                }
            }
        }

        // Cap confidence and determine ephemeral status
        confidence = f32::min(confidence, 1.0);
        let is_ephemeral = confidence >= 0.4; // Threshold for classification

        // Note how many of the analyzed files the entity was absent from
        if is_ephemeral && days_active < self.total_files {
            reasons.push(format!(
                "Absent from {} of the {} analyzed file(s)",
                self.total_files - days_active,
                self.total_files
            ));
        }

        (is_ephemeral, confidence, reasons)
    }

    /// Determine activity pattern based on appearance across files
    fn classify_activity_pattern(&self, entity: &EntityChurnRecord) -> String {
        let days_active = entity.files_appeared.len();

        if days_active == 1 {
            return "single_burst".to_string();
        }

        if days_active == self.total_files {
            return "consistent".to_string();
        }

        if days_active >= (self.total_files * 2) / 3 {
            return "consistent".to_string();
        }

        // Check if activity is declining (appeared early but stopped)
        if let (Some(_first_file), Some(last_file)) =
            (entity.files_appeared.first(), entity.files_appeared.last())
        {
            // Simple heuristic: if last seen was in first half of files, it's declining
            let last_file_num = last_file
                .split('_')
                .next_back()
                // Strip compressed-file extensions too so `.log.gz` / `.log.zst` names parse
                .map(|s| s.trim_end_matches(".gz").trim_end_matches(".zst"))
                .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
                .unwrap_or(self.total_files);

            if last_file_num < self.total_files / 2 {
                return "declining".to_string();
            }
        }

        if days_active <= 2 {
            return "single_burst".to_string();
        }

        "sporadic".to_string()
    }
}

fn format_number(n: usize) -> String {
    let s = n.to_string();
    let mut result = String::new();
    for (i, c) in s.chars().rev().enumerate() {
        if i > 0 && i % 3 == 0 {
            result.push(',');
        }
        result.push(c);
    }
    result.chars().rev().collect()
}

fn get_file_size(path: &str) -> Result<u64> {
    Ok(std::fs::metadata(path)?.len())
}

fn load_entity_mappings(path: &str) -> Result<HashMap<String, EntityMapping>> {
    let file = File::open(path).context("Failed to open entity map file")?;
    let mappings: HashMap<String, EntityMapping> =
        serde_json::from_reader(file).context("Failed to parse entity map JSON")?;
    Ok(mappings)
}

#[derive(Debug, Deserialize, Clone)]
#[allow(dead_code)]
struct BaselineEntity {
    entity_id: String,
    // Fields from entity-list (Vault API) - full metadata
    #[serde(default)]
    entity_name: String,
    #[serde(default)]
    entity_disabled: bool,
    #[serde(default)]
    entity_created: String,
    #[serde(default)]
    entity_updated: String,
    #[serde(default)]
    alias_id: String,
    #[serde(default)]
    alias_name: String,
    #[serde(default)]
    mount_path: String,
    #[serde(default)]
    mount_type: String,
    #[serde(default)]
    mount_accessor: String,
    #[serde(default)]
    alias_created: String,
    #[serde(default)]
    alias_updated: String,
    #[serde(default)]
    alias_metadata: String,
}

impl BaselineEntity {
    /// Get the best available name (entity_name if available, otherwise alias_name)
    fn get_name(&self) -> String {
        if !self.entity_name.is_empty() {
            self.entity_name.clone()
        } else if !self.alias_name.is_empty() {
            self.alias_name.clone()
        } else {
            String::new()
        }
    }

    /// Get the entity creation time
    fn get_created(&self) -> String {
        self.entity_created.clone()
    }
}

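/// Load the baseline entity list produced by `entity-list` (JSON or CSV, chosen by file
/// extension). The JSON form is expected to be an array of objects whose keys match the
/// `BaselineEntity` fields above; a record might look roughly like this illustrative
/// sketch (values are made up, missing fields fall back to serde defaults):
///
/// ```json
/// [
///   {
///     "entity_id": "5d4a8c1e-0000-0000-0000-000000000000",
///     "entity_name": "example-service",
///     "alias_name": "example-service@example.com",
///     "mount_path": "auth/oidc/",
///     "mount_type": "oidc"
///   }
/// ]
/// ```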
fn load_baseline_entities(path: &str) -> Result<HashMap<String, BaselineEntity>> {
    let file = File::open(path).context("Failed to open baseline entities file")?;

    // Check if it's JSON or CSV based on file extension
    let path_lower = path.to_lowercase();
    if path_lower.ends_with(".json") {
        // JSON format from entity-list with --format json
        let entities: Vec<BaselineEntity> =
            serde_json::from_reader(file).context("Failed to parse baseline entities JSON")?;
        Ok(entities
            .into_iter()
            .map(|e| (e.entity_id.clone(), e))
            .collect())
    } else {
        // CSV format (default entity-list output)
        let mut reader = csv::Reader::from_reader(file);
        let mut entities = HashMap::new();

        for result in reader.deserialize() {
            let entity: BaselineEntity = result.context("Failed to parse baseline CSV row")?;
            // Use first occurrence of each entity_id (entities can have multiple aliases)
            entities.entry(entity.entity_id.clone()).or_insert(entity);
        }

        Ok(entities)
    }
}

pub fn run(
    log_files: &[String],
    entity_map: Option<&str>,
    baseline_entities: Option<&str>,
    output: Option<&str>,
    format: Option<&str>,
) -> Result<()> {
    println!("\n=== Multi-Day Entity Churn Analysis ===\n");
    println!("Analyzing {} log files:", log_files.len());
    for (i, file) in log_files.iter().enumerate() {
        let size = get_file_size(file)?;
        println!(
            "  Day {}: {} ({:.2} GB)",
            i + 1,
            file,
            size as f64 / 1_000_000_000.0
        );
    }
    println!();

    // Load baseline entities if provided
    let baseline = if let Some(path) = baseline_entities {
        println!(
            "Loading baseline entity list (Vault API metadata) from {}...",
            path
        );
        let baseline_set = load_baseline_entities(path)?;
        println!(
            "Loaded {} pre-existing entities from Vault API baseline",
            format_number(baseline_set.len())
        );
        println!();
        Some(baseline_set)
    } else {
        println!("No baseline entity list provided. Cannot distinguish truly NEW entities from pre-existing.");
        println!("   All Day 1 entities will be marked as 'pre_existing_or_new_day_1'.");
        println!("   To get accurate results, run: ./vault-audit entity-list --output baseline_entities.json\n");
        None
    };

    // Load entity mappings if provided (historical data from audit logs)
    let entity_mappings = if let Some(path) = entity_map {
        println!(
            "Loading historical entity mappings (audit log enrichment) from {}...",
            path
        );
        let mappings = load_entity_mappings(path)?;
        println!(
            "Loaded {} entity mappings with historical audit log data",
            format_number(mappings.len())
        );
        println!();
        Some(mappings)
    } else {
        None
    };

    // Track all entities across all files
    let mut entities: HashMap<String, EntityChurnRecord> = HashMap::new();
    let mut daily_stats: Vec<DailyStats> = Vec::new();

    // Process each log file in order
    for (file_idx, log_file) in log_files.iter().enumerate() {
        let file_name = Path::new(log_file)
            .file_name()
            .unwrap()
            .to_string_lossy()
            .to_string();

        println!("\nProcessing Day {} ({})...", file_idx + 1, file_name);

        let file = open_file(log_file)
            .with_context(|| format!("Failed to open log file: {}", log_file))?;
        let file_size = get_file_size(log_file)? as usize;

        let reader = BufReader::new(file);
        let mut progress = ProgressBar::new(file_size, "Processing");

        let mut new_entities_this_file = 0;
        let mut returning_entities_this_file = HashSet::new();
        let mut logins_this_file = 0;
        let mut bytes_processed = 0;
        let mut lines_read: usize = 0;

        for line in reader.lines() {
            let line = line.context("Failed to read line from log file")?;
            bytes_processed += line.len() + 1; // +1 for newline
            lines_read += 1;

            // Update progress every 10,000 lines (the running byte count rarely lands on
            // an exact multiple of 10,000, so gating on it would update unpredictably)
            if lines_read % 10_000 == 0 {
                progress.update(bytes_processed.min(file_size));
            }

            let trimmed = line.trim();
            if trimmed.is_empty() {
                continue;
            }

            let entry: AuditEntry = match serde_json::from_str(trimmed) {
                Ok(e) => e,
                Err(_) => continue,
            };

            // Only process login operations (auth paths ending in /login)
            let Some(ref request) = entry.request else {
                continue;
            };
            let Some(ref path) = request.path else {
                continue;
            };
            if !path.ends_with("/login") {
                continue;
            }

            logins_this_file += 1;

            // Extract entity info
            let Some(ref auth) = entry.auth else {
                continue;
            };
            let Some(ref entity_id) = auth.entity_id else {
                continue;
            };

            let display_name = auth
                .display_name
                .clone()
                .unwrap_or_else(|| entity_id.clone());
            let mount_path = request.path.clone().unwrap_or_default();
            let mount_type = request.mount_type.clone().unwrap_or_default();
            let token_type = auth.token_type.clone().unwrap_or_default();

            // Parse timestamp
            let first_seen_time = chrono::DateTime::parse_from_rfc3339(&entry.time)
                .ok()
                .map(|dt| dt.with_timezone(&Utc))
                .unwrap_or_else(Utc::now);

            // Check if this entity exists from a previous file
            if let Some(entity_record) = entities.get_mut(entity_id) {
                // Returning entity
                entity_record.total_logins += 1;
                entity_record.last_seen_file = file_name.clone();
                entity_record.last_seen_time = first_seen_time;
                if !entity_record.files_appeared.contains(&file_name) {
                    entity_record.files_appeared.push(file_name.clone());
                }
                returning_entities_this_file.insert(entity_id.clone());
            } else {
                // New entity (first time across all files)
                new_entities_this_file += 1;

                // Determine lifecycle based on baseline and which file this is
                let lifecycle = if let Some(ref baseline_set) = baseline {
                    if baseline_set.contains_key(entity_id) {
                        "pre_existing_baseline".to_string()
                    } else {
                        // Not in baseline, so truly NEW during analysis period
                        match file_idx {
                            0 => "new_day_1".to_string(),
                            1 => "new_day_2".to_string(),
                            2 => "new_day_3".to_string(),
                            _ => format!("new_day_{}", file_idx + 1),
                        }
                    }
                } else {
                    // No baseline provided, can't distinguish
                    match file_idx {
                        0 => "pre_existing_or_new_day_1".to_string(),
                        1 => "new_day_2".to_string(),
                        2 => "new_day_3".to_string(),
                        _ => format!("new_day_{}", file_idx + 1),
                    }
                };

                // Get baseline metadata if entity exists in baseline
                let (
                    baseline_entity_name,
                    baseline_created,
                    baseline_alias_name,
                    baseline_mount_path,
                ) = if let Some(ref baseline_map) = baseline {
                    if let Some(baseline_entity) = baseline_map.get(entity_id) {
                        let name = baseline_entity.get_name();
                        let created = baseline_entity.get_created();
                        (
                            if !name.is_empty() { Some(name) } else { None },
                            if !created.is_empty() {
                                Some(created)
                            } else {
                                None
                            },
                            if !baseline_entity.alias_name.is_empty() {
                                Some(baseline_entity.alias_name.clone())
                            } else {
                                None
                            },
                            if !baseline_entity.mount_path.is_empty() {
                                Some(baseline_entity.mount_path.clone())
                            } else {
                                None
                            },
                        )
                    } else {
                        (None, None, None, None)
                    }
                } else {
                    (None, None, None, None)
                };

                // Fetch historical data from entity_mappings
                let (
                    historical_display_name,
                    historical_first_seen,
                    historical_last_seen,
                    historical_login_count,
                ) = if let Some(ref mappings) = entity_mappings {
                    if let Some(mapping) = mappings.get(entity_id) {
                        (
                            Some(mapping.display_name.clone()),
                            Some(mapping.first_seen.clone()),
                            Some(mapping.last_seen.clone()),
                            Some(mapping.login_count),
                        )
                    } else {
                        (None, None, None, None)
                    }
                } else {
                    (None, None, None, None)
                };

                entities.insert(
                    entity_id.clone(),
                    EntityChurnRecord {
                        entity_id: entity_id.clone(),
                        display_name: display_name.clone(),
                        mount_path: mount_path.clone(),
                        mount_type: mount_type.clone(),
                        token_type: token_type.clone(),
                        first_seen_file: file_name.clone(),
                        first_seen_time,
                        last_seen_file: file_name.clone(),
                        last_seen_time: first_seen_time,
                        files_appeared: vec![file_name.clone()],
                        total_logins: 1,
                        lifecycle,
                        activity_pattern: "unknown".to_string(), // Will be computed in second pass
                        is_ephemeral_pattern: false,             // Will be computed in second pass
                        ephemeral_confidence: 0.0,               // Will be computed in second pass
                        ephemeral_reasons: Vec::new(),           // Will be computed in second pass
                        baseline_entity_name,
                        baseline_created,
                        baseline_alias_name,
                        baseline_mount_path,
                        historical_display_name,
                        historical_first_seen,
                        historical_last_seen,
                        historical_login_count,
                    },
                );
            }
        }

        progress.finish();

        daily_stats.push(DailyStats {
            file_name,
            new_entities: new_entities_this_file,
            returning_entities: returning_entities_this_file.len(),
            total_logins: logins_this_file,
        });

        println!(
            "Day {} Summary: {} new entities, {} returning, {} logins",
            file_idx + 1,
            format_number(new_entities_this_file),
            format_number(returning_entities_this_file.len()),
            format_number(logins_this_file)
        );
    }

    // === SECOND PASS: Analyze patterns and classify entities ===
    println!("\nAnalyzing entity behavior patterns...");

    let mut analyzer = EphemeralPatternAnalyzer::new(log_files.len());

    // Step 1: Learn patterns from short-lived entities
    analyzer.learn_from_entities(&entities);
    println!(
        "Learned from {} short-lived entity patterns",
        format_number(analyzer.short_lived_patterns.len())
    );

    // Step 2: Classify all entities using learned patterns
    let entity_ids: Vec<String> = entities.keys().cloned().collect();
    for entity_id in entity_ids {
        if let Some(entity) = entities.get_mut(&entity_id) {
            // Classify activity pattern
            entity.activity_pattern = analyzer.classify_activity_pattern(entity);

            // Analyze for ephemeral patterns
            let (is_ephemeral, confidence, reasons) = analyzer.analyze_entity(entity);
            entity.is_ephemeral_pattern = is_ephemeral;
            entity.ephemeral_confidence = confidence;
            entity.ephemeral_reasons = reasons;
        }
    }

    // Generate final report
    println!("\n=== Entity Churn Analysis ===\n");

    println!("Daily Breakdown:");
    for (idx, stats) in daily_stats.iter().enumerate() {
        println!(
            "  Day {}: {} new, {} returning, {} total logins",
            idx + 1,
            format_number(stats.new_entities),
            format_number(stats.returning_entities),
            format_number(stats.total_logins)
        );
    }

    // Lifecycle classification
    let mut lifecycle_counts: HashMap<String, usize> = HashMap::new();
    let mut entities_by_file_count: HashMap<usize, usize> = HashMap::new();

    for entity in entities.values() {
        *lifecycle_counts
            .entry(entity.lifecycle.clone())
            .or_insert(0) += 1;
        *entities_by_file_count
            .entry(entity.files_appeared.len())
            .or_insert(0) += 1;
    }

    println!("\nEntity Lifecycle Classification:");
    let mut lifecycle_vec: Vec<_> = lifecycle_counts.iter().collect();
    lifecycle_vec.sort_by_key(|(k, _)| *k);
    for (lifecycle, count) in lifecycle_vec {
        println!("  {}: {}", lifecycle, format_number(*count));
    }

    println!("\nEntity Persistence:");
    for day_count in 1..=log_files.len() {
        if let Some(count) = entities_by_file_count.get(&day_count) {
            let label = if day_count == 1 {
                "Appeared 1 day only"
            } else if day_count == log_files.len() {
                "Appeared all days (persistent)"
            } else {
                "Appeared some days"
            };
            println!(
                "  {} day(s): {} entities ({})",
                day_count,
                format_number(*count),
                label
            );
        }
    }

    // Activity pattern analysis
    let mut activity_pattern_counts: HashMap<String, usize> = HashMap::new();
    let mut ephemeral_entities = Vec::new();

    for entity in entities.values() {
        *activity_pattern_counts
            .entry(entity.activity_pattern.clone())
            .or_insert(0) += 1;

        if entity.is_ephemeral_pattern {
            ephemeral_entities.push(entity.clone());
        }
    }

    println!("\nActivity Pattern Distribution:");
    let mut pattern_vec: Vec<_> = activity_pattern_counts.iter().collect();
    pattern_vec.sort_by(|a, b| b.1.cmp(a.1));
    for (pattern, count) in pattern_vec {
        println!("  {}: {}", pattern, format_number(*count));
    }

    println!("\nEphemeral Entity Detection:");
    println!(
        "  Detected {} likely ephemeral entities (confidence ≥ 0.4)",
        format_number(ephemeral_entities.len())
    );

    if !ephemeral_entities.is_empty() {
        // Sort by confidence
        ephemeral_entities.sort_by(|a, b| {
            b.ephemeral_confidence
                .partial_cmp(&a.ephemeral_confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        println!("  Top 10 by confidence:");
        for (idx, entity) in ephemeral_entities.iter().take(10).enumerate() {
            println!(
                "    {}. {} (confidence: {:.1}%)",
                idx + 1,
                entity.display_name,
                entity.ephemeral_confidence * 100.0
            );
            for reason in &entity.ephemeral_reasons {
                println!("       - {}", reason);
            }
        }

        // Breakdown by confidence ranges
        let high_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.7)
            .count();
        let med_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.5 && e.ephemeral_confidence < 0.7)
            .count();
        let low_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.4 && e.ephemeral_confidence < 0.5)
            .count();

        println!("\n  Confidence distribution:");
        println!("    High (≥70%): {}", format_number(high_conf));
        println!("    Medium (50-69%): {}", format_number(med_conf));
        println!("    Low (40-49%): {}", format_number(low_conf));
    }

    // Mount path breakdown
    let mut mount_stats: HashMap<String, (usize, String)> = HashMap::new();
    for entity in entities.values() {
        let entry = mount_stats
            .entry(entity.mount_path.clone())
            .or_insert((0, entity.mount_type.clone()));
        entry.0 += 1;
    }

    println!("\nTop Authentication Methods (Total Entities):");
    let mut mount_vec: Vec<_> = mount_stats.iter().collect();
    mount_vec.sort_by(|a, b| b.1 .0.cmp(&a.1 .0));

    for (idx, (path, (count, mount_type))) in mount_vec.iter().take(20).enumerate() {
        println!(
            "  {}. {} ({}): {}",
            idx + 1,
            path,
            mount_type,
            format_number(*count)
        );
    }

    // Calculate GitHub duplication if present
    let github_entities: Vec<_> = entities
        .values()
        .filter(|e| e.mount_path.contains("/github"))
        .collect();

    if !github_entities.is_empty() {
        println!("\n=== GitHub Entity Analysis ===");
        println!(
            "Total GitHub entities: {}",
            format_number(github_entities.len())
        );

        // Extract repo names and count duplicates
        let mut repo_counts: HashMap<String, usize> = HashMap::new();
        for entity in &github_entities {
            // Extract repo from "github-repo:org/repo:..." pattern
            if let Some(repo) = entity.display_name.split(':').nth(1) {
                *repo_counts.entry(repo.to_string()).or_insert(0) += 1;
            }
        }

        println!("Unique repositories: {}", format_number(repo_counts.len()));
        println!("\nTop repositories by entity count:");
        let mut repo_vec: Vec<_> = repo_counts.iter().collect();
        repo_vec.sort_by(|a, b| b.1.cmp(a.1));

        for (idx, (repo, count)) in repo_vec.iter().take(20).enumerate() {
            if **count > 1 {
                println!(
                    "  {}. {}: {} entities",
                    idx + 1,
                    repo,
                    format_number(**count)
                );
            }
        }
    }

    // Export to file if requested
    if let Some(output_path) = output {
        let mut entities_vec: Vec<_> = entities.into_values().collect();
        entities_vec.sort_by(|a, b| a.first_seen_time.cmp(&b.first_seen_time));

        // Determine format from parameter or file extension
        let output_format = format.unwrap_or_else(|| {
            if output_path.ends_with(".csv") {
                "csv"
            } else {
                "json"
            }
        });

        println!(
            "\nExporting detailed entity records to {} (format: {})...",
            output_path, output_format
        );

        let output_file = File::create(output_path)
            .with_context(|| format!("Failed to create output file: {}", output_path))?;

        match output_format {
            "csv" => {
                let mut writer = csv::Writer::from_writer(output_file);
                for entity in &entities_vec {
                    let csv_record: EntityChurnRecordCsv = entity.clone().into();
                    writer
                        .serialize(&csv_record)
                        .context("Failed to write CSV record")?;
                }
                writer.flush().context("Failed to flush CSV writer")?;
            }
            _ => {
                // Default to JSON
                serde_json::to_writer_pretty(output_file, &entities_vec)
                    .context("Failed to write JSON output")?;
            }
        }

        println!(
            "Exported {} entity records",
            format_number(entities_vec.len())
        );
    }

    println!("\n=== Analysis Complete ===\n");
    Ok(())
}
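
// A minimal unit-test sketch (assumption: the crate uses the standard `cargo test`
// harness). It only covers the pure `format_number` helper above; the file-processing
// and classification paths are better exercised via integration tests on sample logs.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn format_number_inserts_thousands_separators() {
        assert_eq!(format_number(0), "0");
        assert_eq!(format_number(999), "999");
        assert_eq!(format_number(1_234_567), "1,234,567");
    }
}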