vault_audit_tools/commands/
entity_churn.rs

1//! Multi-day entity churn analysis with intelligent ephemeral pattern detection.
2//!
3//! ⚠️ **DEPRECATED**: Use `entity-analysis churn` instead.
4//!
5//! ```bash
6//! # Old (deprecated):
7//! vault-audit entity-churn day1.log day2.log day3.log
8//!
9//! # New (recommended):
10//! vault-audit entity-analysis churn day1.log day2.log day3.log
11//! ```
12//!
13//! See [`entity_analysis`](crate::commands::entity_analysis) for the unified command.
14//!
15//! ---
16//!
17//! Tracks entity lifecycle across multiple audit log files (compressed or uncompressed)
18//! to identify:
19//! - New entities appearing each day
20//! - Returning vs. churned entities
21//! - Entity persistence patterns
22//! - Authentication method usage trends
23//! - **Ephemeral entities** using data-driven pattern learning
24//!
25//! # Usage
26//!
27//! ```bash
28//! # Analyze entity churn across a week (compressed files)
29//! vault-audit entity-churn day1.log.gz day2.log.gz day3.log.gz day4.log.gz day5.log.gz day6.log.gz day7.log.gz
30//!
31//! # With baseline for accurate new entity detection
32//! vault-audit entity-churn *.log --baseline baseline_entities.json
33//!
34//! # With entity mappings for enriched display names
35//! vault-audit entity-churn *.log --baseline baseline.json --entity-map entity_mappings.json
36//!
37//! # Export detailed churn data with ephemeral analysis
38//! vault-audit entity-churn *.log --output entity_churn.json
39//!
40//! # Export as CSV format
41//! vault-audit entity-churn *.log --output entity_churn.csv --format csv
42//! ```
43//!
44//! **Compressed File Support**: Automatically handles `.gz` and `.zst` files - no manual
45//! decompression required. Mix compressed and uncompressed files freely.
46//!
47//! # Ephemeral Pattern Detection
48//!
49//! The command uses a sophisticated two-pass analysis to detect ephemeral entities
50//! (e.g., CI/CD pipeline entities, temporary build entities) with confidence scoring:
51//!
52//! **Pass 1: Data Collection**
53//! - Track all entities across log files
54//! - Record first/last seen times and files
55//! - Count login activity per entity
56//!
57//! **Pass 2: Pattern Learning & Classification**
58//! - Learn patterns from entities that appeared 1-2 days
59//! - Identify naming patterns (e.g., `github-repo:org/repo:ref:branch`)
60//! - Calculate confidence scores (0.0-1.0) based on:
61//!   - Days active (1 day = high confidence, 2 days = medium)
62//!   - Similar entities on same mount path
63//!   - Activity levels (low login counts)
64//!   - Gaps in activity (reduces confidence for sporadic access)
65//!
66//! # Output
67//!
68//! ## Entity Lifecycle Classification:
69//! - **`new_day_N`**: Entities first seen on day N (not in baseline)
70//! - **`pre_existing_baseline`**: Entities that existed before analysis period
71//!
72//! ## Activity Patterns:
73//! - **consistent**: Appeared in most/all log files
74//! - **sporadic**: Appeared intermittently with gaps
75//! - **declining**: Activity decreased over time
76//! - **`single_burst`**: Appeared only once
77//!
78//! ## Ephemeral Detection:
79//! - Confidence levels: High (≥70%), Medium (50-69%), Low (40-49%)
80//! - Detailed reasoning for each classification
81//! - Top ephemeral entities by confidence
82//! - Pattern statistics and mount path analysis
83//!
84//! # JSON Output Fields
85//!
86//! When using `--output`, each entity record includes:
87//! - `entity_id`: Vault entity identifier
88//! - `display_name`: Human-readable name
89//! - `first_seen_file` / `first_seen_time`: When first observed
90//! - `last_seen_file` / `last_seen_time`: When last observed
91//! - `files_appeared`: List of log files entity was active in
92//! - `total_logins`: Total login count across all files
93//! - `lifecycle`: Entity lifecycle classification
94//! - `activity_pattern`: Behavioral pattern classification
95//! - `is_ephemeral_pattern`: Boolean flag for ephemeral detection
96//! - `ephemeral_confidence`: Confidence score (0.0-1.0)
97//! - `ephemeral_reasons`: Array of human-readable reasons
98//!
99//! Only tracks entities that performed login operations (paths ending in `/login`).
100
101use crate::audit::types::AuditEntry;
102use crate::utils::format::format_number;
103use crate::utils::progress::ProgressBar;
104use crate::utils::reader::open_file;
105use anyhow::{Context, Result};
106use chrono::{DateTime, Utc};
107use serde::{Deserialize, Serialize};
108use std::collections::{HashMap, HashSet};
109use std::fs::File;
110use std::io::{BufRead, BufReader};
111use std::path::Path;
112
/// Entity mapping from baseline CSV files
///
/// One record per entity id, loaded from the `--entity-map` JSON file
/// (historical audit-log enrichment produced by `preprocess-entities`).
/// Used to attach historical display names and activity to churn records.
#[derive(Debug, Serialize, Deserialize)]
struct EntityMapping {
    /// Human-readable name recorded in historical audit logs.
    display_name: String,
    /// Auth mount the entity historically logged in through.
    mount_path: String,
    #[allow(dead_code)]
    mount_accessor: String,
    // NOTE(review): login_count/first_seen/last_seen are read when enriching
    // churn records, so these `dead_code` allows look stale — confirm.
    #[allow(dead_code)]
    login_count: usize,
    #[allow(dead_code)]
    first_seen: String,
    #[allow(dead_code)]
    last_seen: String,
}
127
/// Represents an entity's churn status
///
/// Built incrementally during the first pass over the log files. The
/// classification fields (`activity_pattern`, `is_ephemeral_pattern`,
/// `ephemeral_confidence`, `ephemeral_reasons`) are placeholders until the
/// second pass fills them in via `EphemeralPatternAnalyzer`.
#[derive(Debug, Serialize, Clone)]
struct EntityChurnRecord {
    /// Vault entity identifier (from the audit entry's auth block).
    entity_id: String,
    /// Display name from the audit entry, falling back to the entity id.
    display_name: String,
    /// Request path of the first observed login for this entity.
    mount_path: String,
    /// Mount type of the first observed login request.
    mount_type: String,
    /// Token type issued on the first observed login.
    token_type: String,
    /// Log file in which the entity was first observed.
    first_seen_file: String,
    first_seen_time: DateTime<Utc>,
    /// Log file in which the entity was most recently observed.
    last_seen_file: String,
    last_seen_time: DateTime<Utc>,
    /// Distinct log files the entity appeared in, in processing order.
    files_appeared: Vec<String>,
    /// Total login count across all files.
    total_logins: usize,
    lifecycle: String, // "new_day_1", "new_day_2", "new_day_3", "pre_existing"
    activity_pattern: String, // "consistent", "sporadic", "declining", "single_burst", "unknown"
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32, // 0.0 to 1.0
    /// Human-readable reasons supporting the ephemeral classification.
    ephemeral_reasons: Vec<String>,
    // Baseline metadata (if entity existed in baseline)
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_entity_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_created: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_alias_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_mount_path: Option<String>,
    // Entity-map metadata (from historical audit logs via preprocess-entities)
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_display_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_first_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_last_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_login_count: Option<usize>,
}
166
/// CSV-compatible representation of entity churn record
///
/// Flat mirror of `EntityChurnRecord`: timestamps are rendered as RFC 3339
/// strings, list fields are joined into delimited strings, and optional
/// metadata collapses to an empty string when absent (CSV has no null).
#[derive(Debug, Serialize)]
struct EntityChurnRecordCsv {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    /// RFC 3339 rendering of the first-seen timestamp.
    first_seen_time: String,
    last_seen_file: String,
    /// RFC 3339 rendering of the last-seen timestamp.
    last_seen_time: String,
    files_appeared: String, // Comma-separated list
    /// Number of distinct log files the entity appeared in.
    days_active: usize,
    total_logins: usize,
    lifecycle: String,
    activity_pattern: String,
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32,
    ephemeral_reasons: String, // Semicolon-separated list
    // Baseline fields: empty string when the entity was not in the baseline.
    baseline_entity_name: String,
    baseline_created: String,
    baseline_alias_name: String,
    baseline_mount_path: String,
    // Historical fields: empty string when no entity mapping was provided.
    historical_display_name: String,
    historical_first_seen: String,
    historical_last_seen: String,
    historical_login_count: String,
}
196
197impl From<EntityChurnRecord> for EntityChurnRecordCsv {
198    fn from(record: EntityChurnRecord) -> Self {
199        Self {
200            entity_id: record.entity_id,
201            display_name: record.display_name,
202            mount_path: record.mount_path,
203            mount_type: record.mount_type,
204            token_type: record.token_type,
205            first_seen_file: record.first_seen_file,
206            first_seen_time: record.first_seen_time.to_rfc3339(),
207            last_seen_file: record.last_seen_file,
208            last_seen_time: record.last_seen_time.to_rfc3339(),
209            files_appeared: record.files_appeared.join(", "),
210            days_active: record.files_appeared.len(),
211            total_logins: record.total_logins,
212            lifecycle: record.lifecycle,
213            activity_pattern: record.activity_pattern,
214            is_ephemeral_pattern: record.is_ephemeral_pattern,
215            ephemeral_confidence: record.ephemeral_confidence,
216            ephemeral_reasons: record.ephemeral_reasons.join("; "),
217            baseline_entity_name: record.baseline_entity_name.unwrap_or_default(),
218            baseline_created: record.baseline_created.unwrap_or_default(),
219            baseline_alias_name: record.baseline_alias_name.unwrap_or_default(),
220            baseline_mount_path: record.baseline_mount_path.unwrap_or_default(),
221            historical_display_name: record.historical_display_name.unwrap_or_default(),
222            historical_first_seen: record.historical_first_seen.unwrap_or_default(),
223            historical_last_seen: record.historical_last_seen.unwrap_or_default(),
224            historical_login_count: record
225                .historical_login_count
226                .map(|n| n.to_string())
227                .unwrap_or_default(),
228        }
229    }
230}
231
/// Per-log-file (per-day) aggregate counters collected during the first pass.
#[derive(Debug, Clone)]
struct DailyStats {
    #[allow(dead_code)]
    file_name: String,
    /// Entities seen for the first time in this file (across all files so far).
    new_entities: usize,
    /// Distinct entities already seen in an earlier file.
    returning_entities: usize,
    /// Login operations observed in this file.
    total_logins: usize,
}
240
/// Analyzes entity behavior patterns to detect ephemeral entities
///
/// Used in two phases: `learn_from_entities` first collects patterns from
/// short-lived entities, then `analyze_entity` / `classify_activity_pattern`
/// score each entity against what was learned.
#[derive(Debug)]
struct EphemeralPatternAnalyzer {
    /// Total number of log files in the analysis window.
    total_files: usize,
    /// Patterns learned from entities active on at most 2 days.
    short_lived_patterns: Vec<ShortLivedPattern>,
}
247
/// Snapshot of a short-lived (1-2 day) entity captured during the learning
/// pass; serves as a reference pattern when scoring other entities.
#[derive(Debug)]
struct ShortLivedPattern {
    /// Number of distinct log files the entity appeared in (1 or 2).
    days_active: usize,
    /// Display name; compared by prefix (text before the first ':').
    display_name: String,
    /// Auth mount path; compared by exact match.
    mount_path: String,
}
254
255impl EphemeralPatternAnalyzer {
256    const fn new(total_files: usize) -> Self {
257        Self {
258            total_files,
259            short_lived_patterns: Vec::new(),
260        }
261    }
262
263    /// Learn patterns from entities that appeared 1-2 days (potential ephemeral patterns)
264    fn learn_from_entities(&mut self, entities: &HashMap<String, EntityChurnRecord>) {
265        for entity in entities.values() {
266            let days_active = entity.files_appeared.len();
267
268            // Learn from entities that appeared 1-2 days only
269            if days_active <= 2 {
270                self.short_lived_patterns.push(ShortLivedPattern {
271                    days_active,
272                    display_name: entity.display_name.clone(),
273                    mount_path: entity.mount_path.clone(),
274                });
275            }
276        }
277    }
278
279    /// Analyze an entity and determine if it matches ephemeral patterns
280    fn analyze_entity(&self, entity: &EntityChurnRecord) -> (bool, f32, Vec<String>) {
281        let days_active = entity.files_appeared.len();
282        let mut confidence = 0.0;
283        let mut reasons = Vec::new();
284
285        // Strong indicators (high confidence)
286        if days_active == 1 {
287            confidence += 0.5;
288            reasons.push(format!("Appeared only 1 day ({})", entity.first_seen_file));
289        } else if days_active == 2 {
290            confidence += 0.3;
291            reasons.push(format!(
292                "Appeared only 2 days: {}, {}",
293                entity.files_appeared.first().unwrap_or(&String::new()),
294                entity.files_appeared.last().unwrap_or(&String::new())
295            ));
296        }
297
298        // Pattern matching: Check if display name follows patterns seen in other short-lived entities
299        if days_active <= 2 {
300            // Count how many other short-lived entities share similar patterns
301            let similar_count = self
302                .short_lived_patterns
303                .iter()
304                .filter(|p| {
305                    // Same mount path
306                    if p.mount_path == entity.mount_path && p.days_active <= 2 {
307                        return true;
308                    }
309                    // Similar naming pattern (e.g., github-repo:* or airflow-*)
310                    if entity.display_name.contains(':') && p.display_name.contains(':') {
311                        let entity_prefix = entity.display_name.split(':').next().unwrap_or("");
312                        let pattern_prefix = p.display_name.split(':').next().unwrap_or("");
313                        if entity_prefix == pattern_prefix && !entity_prefix.is_empty() {
314                            return true;
315                        }
316                    }
317                    false
318                })
319                .count();
320
321            if similar_count > 5 {
322                confidence += 0.2;
323                reasons.push(format!(
324                    "Matches pattern seen in {} other short-lived entities",
325                    similar_count
326                ));
327            } else if similar_count > 0 {
328                confidence += 0.1;
329                reasons.push(format!(
330                    "Similar to {} other short-lived entities",
331                    similar_count
332                ));
333            }
334        }
335
336        // Low activity indicator
337        if entity.total_logins <= 5 && days_active <= 2 {
338            confidence += 0.1;
339            reasons.push(format!(
340                "Low activity: only {} login(s)",
341                entity.total_logins
342            ));
343        }
344
345        // Non-continuous appearance (sporadic pattern suggests not churned, just periodic)
346        if days_active >= 2 {
347            let first_day_idx = entity.files_appeared.first().and_then(|f| {
348                f.split('_')
349                    .next_back()
350                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
351            });
352            let last_day_idx = entity.files_appeared.last().and_then(|f| {
353                f.split('_')
354                    .next_back()
355                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
356            });
357
358            if let (Some(first), Some(last)) = (first_day_idx, last_day_idx) {
359                let span = last - first + 1;
360                if span > days_active {
361                    // Gaps in activity - reduce confidence
362                    confidence *= 0.7;
363                    reasons.push(
364                        "Has gaps in activity (possibly sporadic access, not churned)".to_string(),
365                    );
366                }
367            }
368        }
369
370        // Cap confidence and determine ephemeral status
371        confidence = f32::min(confidence, 1.0);
372        let is_ephemeral = confidence >= 0.4; // Threshold for classification
373
374        // Add absence indicator if not seen in recent files
375        if is_ephemeral && days_active < self.total_files {
376            reasons.push(format!(
377                "Not seen in most recent {} file(s)",
378                self.total_files - days_active
379            ));
380        }
381
382        (is_ephemeral, confidence, reasons)
383    }
384
385    /// Determine activity pattern based on appearance across files
386    fn classify_activity_pattern(&self, entity: &EntityChurnRecord) -> String {
387        let days_active = entity.files_appeared.len();
388
389        if days_active == 1 {
390            return "single_burst".to_string();
391        }
392
393        if days_active == self.total_files {
394            return "consistent".to_string();
395        }
396
397        if days_active >= (self.total_files * 2) / 3 {
398            return "consistent".to_string();
399        }
400
401        // Check if activity is declining (appeared early but stopped)
402        if let (Some(_first_file), Some(last_file)) =
403            (entity.files_appeared.first(), entity.files_appeared.last())
404        {
405            // Simple heuristic: if last seen was in first half of files, it's declining
406            let last_file_num = last_file
407                .split('_')
408                .next_back()
409                .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
410                .unwrap_or(self.total_files);
411
412            if last_file_num < self.total_files / 2 {
413                return "declining".to_string();
414            }
415        }
416
417        if days_active <= 2 {
418            return "single_burst".to_string();
419        }
420
421        "sporadic".to_string()
422    }
423}
424
425fn get_file_size(path: &str) -> Result<u64> {
426    Ok(std::fs::metadata(path)?.len())
427}
428
429fn load_entity_mappings(path: &str) -> Result<HashMap<String, EntityMapping>> {
430    let file = File::open(path).context("Failed to open entity map file")?;
431    let mappings: HashMap<String, EntityMapping> =
432        serde_json::from_reader(file).context("Failed to parse entity map JSON")?;
433    Ok(mappings)
434}
435
/// A single entity from the baseline inventory (`entity-list` output).
///
/// Every field except `entity_id` uses `#[serde(default)]` so that both
/// the full JSON export and sparser CSV rows deserialize cleanly; absent
/// values become empty strings / false.
#[derive(Debug, Deserialize, Clone)]
#[allow(dead_code)]
struct BaselineEntity {
    entity_id: String,
    // Fields from entity-list (Vault API) - full metadata
    #[serde(default)]
    entity_name: String,
    #[serde(default)]
    entity_disabled: bool,
    #[serde(default)]
    entity_created: String,
    #[serde(default)]
    entity_updated: String,
    // Alias fields: one alias per row in CSV output; entities with multiple
    // aliases appear as multiple rows.
    #[serde(default)]
    alias_id: String,
    #[serde(default)]
    alias_name: String,
    #[serde(default)]
    mount_path: String,
    #[serde(default)]
    mount_type: String,
    #[serde(default)]
    mount_accessor: String,
    #[serde(default)]
    alias_created: String,
    #[serde(default)]
    alias_updated: String,
    #[serde(default)]
    alias_metadata: String,
}
466
467impl BaselineEntity {
468    /// Get the best available name (`entity_name` if available, otherwise `alias_name`)
469    fn get_name(&self) -> String {
470        if !self.entity_name.is_empty() {
471            self.entity_name.clone()
472        } else if !self.alias_name.is_empty() {
473            self.alias_name.clone()
474        } else {
475            String::new()
476        }
477    }
478
479    /// Get the entity creation time
480    fn get_created(&self) -> String {
481        self.entity_created.clone()
482    }
483}
484
485fn load_baseline_entities(path: &str) -> Result<HashMap<String, BaselineEntity>> {
486    let file = File::open(path).context("Failed to open baseline entities file")?;
487
488    // Check if it's JSON or CSV based on file extension
489    let path_lower = path.to_lowercase();
490    if std::path::Path::new(&path_lower)
491        .extension()
492        .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
493    {
494        // JSON format from entity-list with --format json
495        let entities: Vec<BaselineEntity> =
496            serde_json::from_reader(file).context("Failed to parse baseline entities JSON")?;
497        Ok(entities
498            .into_iter()
499            .map(|e| (e.entity_id.clone(), e))
500            .collect())
501    } else {
502        // CSV format (default entity-list output)
503        let mut reader = csv::Reader::from_reader(file);
504        let mut entities = HashMap::with_capacity(5000); // Pre-allocate for entity mappings
505
506        for result in reader.deserialize() {
507            let entity: BaselineEntity = result.context("Failed to parse baseline CSV row")?;
508            // Use first occurrence of each entity_id (entities can have multiple aliases)
509            entities.entry(entity.entity_id.clone()).or_insert(entity);
510        }
511
512        Ok(entities)
513    }
514}
515
516pub fn run(
517    log_files: &[String],
518    entity_map: Option<&str>,
519    baseline_entities: Option<&str>,
520    output: Option<&str>,
521    format: Option<&str>,
522) -> Result<()> {
523    println!("\n=== Multi-Day Entity Churn Analysis ===\n");
524    println!("Analyzing {} log files:", log_files.len());
525    for (i, file) in log_files.iter().enumerate() {
526        let size = get_file_size(file)?;
527        println!(
528            "  Day {}: {} ({:.2} GB)",
529            i + 1,
530            file,
531            size as f64 / 1_000_000_000.0
532        );
533    }
534    println!();
535
536    // Load baseline entities if provided
537    let baseline = if let Some(path) = baseline_entities {
538        println!(
539            "Loading baseline entity list (Vault API metadata) from {}...",
540            path
541        );
542        let baseline_set = load_baseline_entities(path)?;
543        println!(
544            "Loaded {} pre-existing entities from Vault API baseline",
545            format_number(baseline_set.len())
546        );
547        println!();
548        Some(baseline_set)
549    } else {
550        println!("No baseline entity list provided. Cannot distinguish truly NEW entities from pre-existing.");
551        println!("   All Day 1 entities will be marked as 'pre_existing_or_new_day_1'.");
552        println!("   To get accurate results, run: ./vault-audit entity-list --output baseline_entities.json\n");
553        None
554    };
555
556    // Load entity mappings if provided (historical data from audit logs)
557    let entity_mappings = if let Some(path) = entity_map {
558        println!(
559            "Loading historical entity mappings (audit log enrichment) from {}...",
560            path
561        );
562        let mappings = load_entity_mappings(path)?;
563        println!(
564            "Loaded {} entity mappings with historical audit log data",
565            format_number(mappings.len())
566        );
567        println!();
568        Some(mappings)
569    } else {
570        None
571    };
572
573    // Track all entities across all files
574    // Pre-allocate for typical entity counts in enterprise environments
575    let mut entities: HashMap<String, EntityChurnRecord> = HashMap::with_capacity(5000);
576    let mut daily_stats: Vec<DailyStats> = Vec::new();
577
578    // Process each log file in order
579    for (file_idx, log_file) in log_files.iter().enumerate() {
580        let file_name = Path::new(log_file)
581            .file_name()
582            .unwrap()
583            .to_string_lossy()
584            .to_string();
585
586        eprintln!("\nProcessing Day {} ({})...", file_name, file_idx + 1);
587
588        // Count lines in file first for accurate progress tracking
589        eprintln!("Scanning file to determine total lines...");
590        let total_file_lines = crate::utils::parallel::count_file_lines(log_file)?;
591
592        let file = open_file(log_file)
593            .with_context(|| format!("Failed to open log file: {}", log_file))?;
594
595        let reader = BufReader::new(file);
596        let progress = ProgressBar::new(total_file_lines, "Processing");
597
598        let mut new_entities_this_file = 0;
599        let mut returning_entities_this_file = HashSet::new();
600        let mut logins_this_file = 0;
601        let mut lines_processed = 0;
602
603        for line in reader.lines() {
604            let line = line.context("Failed to read line from log file")?;
605            lines_processed += 1;
606
607            // Update progress periodically
608            if lines_processed % 10_000 == 0 {
609                progress.update(lines_processed);
610            }
611
612            let trimmed = line.trim();
613            if trimmed.is_empty() {
614                continue;
615            }
616
617            let entry: AuditEntry = match serde_json::from_str(trimmed) {
618                Ok(e) => e,
619                Err(_) => continue,
620            };
621
622            // Only process login operations (auth paths ending in /login)
623            let Some(ref request) = entry.request else {
624                continue;
625            };
626            let Some(ref path) = request.path else {
627                continue;
628            };
629            if !path.ends_with("/login") {
630                continue;
631            }
632
633            logins_this_file += 1;
634
635            // Extract entity info
636            let Some(ref auth) = entry.auth else {
637                continue;
638            };
639            let Some(ref entity_id) = auth.entity_id else {
640                continue;
641            };
642
643            let display_name = auth
644                .display_name
645                .clone()
646                .unwrap_or_else(|| entity_id.clone());
647            let mount_path = request.path.clone().unwrap_or_default();
648            let mount_type = request.mount_type.clone().unwrap_or_default();
649            let token_type = auth.token_type.clone().unwrap_or_default();
650
651            // Parse timestamp
652            let first_seen_time = chrono::DateTime::parse_from_rfc3339(&entry.time)
653                .ok()
654                .map_or_else(Utc::now, |dt| dt.with_timezone(&Utc));
655
656            // Check if this entity exists from a previous file
657            if let Some(entity_record) = entities.get_mut(entity_id) {
658                // Returning entity
659                entity_record.total_logins += 1;
660                entity_record.last_seen_file.clone_from(&file_name);
661                entity_record.last_seen_time = first_seen_time;
662                if !entity_record.files_appeared.contains(&file_name) {
663                    entity_record.files_appeared.push(file_name.clone());
664                }
665                returning_entities_this_file.insert(entity_id.clone());
666            } else {
667                // New entity (first time across all files)
668                new_entities_this_file += 1;
669
670                // Determine lifecycle based on baseline and which file this is
671                let lifecycle = if let Some(ref baseline_set) = baseline {
672                    if baseline_set.contains_key(entity_id) {
673                        "pre_existing_baseline".to_string()
674                    } else {
675                        // Not in baseline, so truly NEW during analysis period
676                        match file_idx {
677                            0 => "new_day_1".to_string(),
678                            1 => "new_day_2".to_string(),
679                            2 => "new_day_3".to_string(),
680                            _ => format!("new_day_{}", file_idx + 1),
681                        }
682                    }
683                } else {
684                    // No baseline provided, can't distinguish
685                    match file_idx {
686                        0 => "pre_existing_or_new_day_1".to_string(),
687                        1 => "new_day_2".to_string(),
688                        2 => "new_day_3".to_string(),
689                        _ => format!("new_day_{}", file_idx + 1),
690                    }
691                };
692
693                // Get baseline metadata if entity exists in baseline
694                let (
695                    baseline_entity_name,
696                    baseline_created,
697                    baseline_alias_name,
698                    baseline_mount_path,
699                ) = if let Some(ref baseline_map) = baseline {
700                    if let Some(baseline_entity) = baseline_map.get(entity_id) {
701                        let name = baseline_entity.get_name();
702                        let created = baseline_entity.get_created();
703                        (
704                            if name.is_empty() { None } else { Some(name) },
705                            if created.is_empty() {
706                                None
707                            } else {
708                                Some(created)
709                            },
710                            if baseline_entity.alias_name.is_empty() {
711                                None
712                            } else {
713                                Some(baseline_entity.alias_name.clone())
714                            },
715                            if baseline_entity.mount_path.is_empty() {
716                                None
717                            } else {
718                                Some(baseline_entity.mount_path.clone())
719                            },
720                        )
721                    } else {
722                        (None, None, None, None)
723                    }
724                } else {
725                    (None, None, None, None)
726                };
727
728                // Fetch historical data from entity_mappings
729                let (
730                    historical_display_name,
731                    historical_first_seen,
732                    historical_last_seen,
733                    historical_login_count,
734                ) = if let Some(ref mappings) = entity_mappings {
735                    if let Some(mapping) = mappings.get(entity_id) {
736                        (
737                            Some(mapping.display_name.clone()),
738                            Some(mapping.first_seen.clone()),
739                            Some(mapping.last_seen.clone()),
740                            Some(mapping.login_count),
741                        )
742                    } else {
743                        (None, None, None, None)
744                    }
745                } else {
746                    (None, None, None, None)
747                };
748
749                entities.insert(
750                    entity_id.clone(),
751                    EntityChurnRecord {
752                        entity_id: entity_id.clone(),
753                        display_name: display_name.clone(),
754                        mount_path: mount_path.clone(),
755                        mount_type: mount_type.clone(),
756                        token_type: token_type.clone(),
757                        first_seen_file: file_name.clone(),
758                        first_seen_time,
759                        last_seen_file: file_name.clone(),
760                        last_seen_time: first_seen_time,
761                        files_appeared: vec![file_name.clone()],
762                        total_logins: 1,
763                        lifecycle,
764                        activity_pattern: "unknown".to_string(), // Will be computed in second pass
765                        is_ephemeral_pattern: false,             // Will be computed in second pass
766                        ephemeral_confidence: 0.0,               // Will be computed in second pass
767                        ephemeral_reasons: Vec::new(),           // Will be computed in second pass
768                        baseline_entity_name,
769                        baseline_created,
770                        baseline_alias_name,
771                        baseline_mount_path,
772                        historical_display_name,
773                        historical_first_seen,
774                        historical_last_seen,
775                        historical_login_count,
776                    },
777                );
778            }
779        }
780
781        // Ensure 100% progress for this file
782        progress.update(total_file_lines);
783        progress.finish();
784
785        daily_stats.push(DailyStats {
786            file_name,
787            new_entities: new_entities_this_file,
788            returning_entities: returning_entities_this_file.len(),
789            total_logins: logins_this_file,
790        });
791
792        eprintln!(
793            "Day {} Summary: {} new entities, {} returning, {} logins",
794            file_idx + 1,
795            format_number(new_entities_this_file),
796            format_number(returning_entities_this_file.len()),
797            format_number(logins_this_file)
798        );
799    }
800
801    // === SECOND PASS: Analyze patterns and classify entities ===
802    println!("\nAnalyzing entity behavior patterns...");
803
804    let mut analyzer = EphemeralPatternAnalyzer::new(log_files.len());
805
806    // Step 1: Learn patterns from short-lived entities
807    analyzer.learn_from_entities(&entities);
808    println!(
809        "Learned from {} short-lived entity patterns",
810        format_number(analyzer.short_lived_patterns.len())
811    );
812
813    // Step 2: Classify all entities using learned patterns
814    let entity_ids: Vec<String> = entities.keys().cloned().collect();
815    for entity_id in entity_ids {
816        if let Some(entity) = entities.get_mut(&entity_id) {
817            // Classify activity pattern
818            entity.activity_pattern = analyzer.classify_activity_pattern(entity);
819
820            // Analyze for ephemeral patterns
821            let (is_ephemeral, confidence, reasons) = analyzer.analyze_entity(entity);
822            entity.is_ephemeral_pattern = is_ephemeral;
823            entity.ephemeral_confidence = confidence;
824            entity.ephemeral_reasons = reasons;
825        }
826    }
827
828    // Generate final report
829    println!("\n=== Entity Churn Analysis ===\n");
830
831    println!("Daily Breakdown:");
832    for (idx, stats) in daily_stats.iter().enumerate() {
833        println!(
834            "  Day {}: {} new, {} returning, {} total logins",
835            idx + 1,
836            format_number(stats.new_entities),
837            format_number(stats.returning_entities),
838            format_number(stats.total_logins)
839        );
840    }
841
842    // Lifecycle classification
843    let mut lifecycle_counts: HashMap<String, usize> = HashMap::with_capacity(20); // Small set of lifecycle categories
844    let mut entities_by_file_count: HashMap<usize, usize> = HashMap::with_capacity(log_files.len());
845
846    for entity in entities.values() {
847        *lifecycle_counts
848            .entry(entity.lifecycle.clone())
849            .or_insert(0) += 1;
850        *entities_by_file_count
851            .entry(entity.files_appeared.len())
852            .or_insert(0) += 1;
853    }
854
855    println!("\nEntity Lifecycle Classification:");
856    let mut lifecycle_vec: Vec<_> = lifecycle_counts.iter().collect();
857    lifecycle_vec.sort_by_key(|(k, _)| *k);
858    for (lifecycle, count) in lifecycle_vec {
859        println!("  {}: {}", lifecycle, format_number(*count));
860    }
861
862    println!("\nEntity Persistence:");
863    for day_count in 1..=log_files.len() {
864        if let Some(count) = entities_by_file_count.get(&day_count) {
865            let label = if day_count == 1 {
866                "Appeared 1 day only"
867            } else if day_count == log_files.len() {
868                "Appeared all days (persistent)"
869            } else {
870                "Appeared some days"
871            };
872            println!(
873                "  {} day(s): {} entities ({})",
874                day_count,
875                format_number(*count),
876                label
877            );
878        }
879    }
880
881    // Activity pattern analysis
882    let mut activity_pattern_counts: HashMap<String, usize> = HashMap::with_capacity(10); // Small set of activity patterns
883    let mut ephemeral_entities = Vec::new();
884
885    for entity in entities.values() {
886        *activity_pattern_counts
887            .entry(entity.activity_pattern.clone())
888            .or_insert(0) += 1;
889
890        if entity.is_ephemeral_pattern {
891            ephemeral_entities.push(entity.clone());
892        }
893    }
894
895    println!("\nActivity Pattern Distribution:");
896    let mut pattern_vec: Vec<_> = activity_pattern_counts.iter().collect();
897    pattern_vec.sort_by(|a, b| b.1.cmp(a.1));
898    for (pattern, count) in pattern_vec {
899        println!("  {}: {}", pattern, format_number(*count));
900    }
901
902    println!("\nEphemeral Entity Detection:");
903    println!(
904        "  Detected {} likely ephemeral entities (confidence ≥ 0.4)",
905        format_number(ephemeral_entities.len())
906    );
907
908    if !ephemeral_entities.is_empty() {
909        // Sort by confidence
910        ephemeral_entities.sort_by(|a, b| {
911            b.ephemeral_confidence
912                .partial_cmp(&a.ephemeral_confidence)
913                .unwrap_or(std::cmp::Ordering::Equal)
914        });
915
916        println!("  Top 10 by confidence:");
917        for (idx, entity) in ephemeral_entities.iter().take(10).enumerate() {
918            println!(
919                "    {}. {} (confidence: {:.1}%)",
920                idx + 1,
921                entity.display_name,
922                entity.ephemeral_confidence * 100.0
923            );
924            for reason in &entity.ephemeral_reasons {
925                println!("       - {}", reason);
926            }
927        }
928
929        // Breakdown by confidence ranges
930        let high_conf = ephemeral_entities
931            .iter()
932            .filter(|e| e.ephemeral_confidence >= 0.7)
933            .count();
934        let med_conf = ephemeral_entities
935            .iter()
936            .filter(|e| e.ephemeral_confidence >= 0.5 && e.ephemeral_confidence < 0.7)
937            .count();
938        let low_conf = ephemeral_entities
939            .iter()
940            .filter(|e| e.ephemeral_confidence >= 0.4 && e.ephemeral_confidence < 0.5)
941            .count();
942
943        println!("\n  Confidence distribution:");
944        println!("    High (≥70%): {}", format_number(high_conf));
945        println!("    Medium (50-69%): {}", format_number(med_conf));
946        println!("    Low (40-49%): {}", format_number(low_conf));
947    }
948
949    // Mount path breakdown
950    let mut mount_stats: HashMap<String, (usize, String)> = HashMap::with_capacity(100); // Typical: dozens of mount points
951    for entity in entities.values() {
952        let entry = mount_stats
953            .entry(entity.mount_path.clone())
954            .or_insert_with(|| (0, entity.mount_type.clone()));
955        entry.0 += 1;
956    }
957
958    println!("\nTop Authentication Methods (Total Entities):");
959    let mut mount_vec: Vec<_> = mount_stats.iter().collect();
960    mount_vec.sort_by(|a, b| b.1 .0.cmp(&a.1 .0));
961
962    for (idx, (path, (count, mount_type))) in mount_vec.iter().take(20).enumerate() {
963        println!(
964            "  {}. {} ({}): {}",
965            idx + 1,
966            path,
967            mount_type,
968            format_number(*count)
969        );
970    }
971
972    // Calculate GitHub duplication if present
973    let github_entities: Vec<_> = entities
974        .values()
975        .filter(|e| e.mount_path.contains("/github"))
976        .collect();
977
978    if !github_entities.is_empty() {
979        println!("\n=== GitHub Entity Analysis ===");
980        println!(
981            "Total GitHub entities: {}",
982            format_number(github_entities.len())
983        );
984
985        // Extract repo names and count duplicates
986        let mut repo_counts: HashMap<String, usize> = HashMap::new();
987        for entity in &github_entities {
988            // Extract repo from "github-repo:org/repo:..." pattern
989            if let Some(repo) = entity.display_name.split(':').nth(1) {
990                *repo_counts.entry(repo.to_string()).or_insert(0) += 1;
991            }
992        }
993
994        println!("Unique repositories: {}", format_number(repo_counts.len()));
995        println!("\nTop repositories by entity count:");
996        let mut repo_vec: Vec<_> = repo_counts.iter().collect();
997        repo_vec.sort_by(|a, b| b.1.cmp(a.1));
998
999        for (idx, (repo, count)) in repo_vec.iter().take(20).enumerate() {
1000            if **count > 1 {
1001                println!(
1002                    "  {}. {}: {} entities",
1003                    idx + 1,
1004                    repo,
1005                    format_number(**count)
1006                );
1007            }
1008        }
1009    }
1010
1011    // Export to file if requested
1012    if let Some(output_path) = output {
1013        let mut entities_vec: Vec<_> = entities.into_values().collect();
1014        entities_vec.sort_by(|a, b| a.first_seen_time.cmp(&b.first_seen_time));
1015
1016        // Determine format from parameter or file extension
1017        let output_format = format.unwrap_or_else(|| {
1018            if std::path::Path::new(output_path)
1019                .extension()
1020                .is_some_and(|ext| ext.eq_ignore_ascii_case("csv"))
1021            {
1022                "csv"
1023            } else {
1024                "json"
1025            }
1026        });
1027
1028        println!(
1029            "\nExporting detailed entity records to {} (format: {})...",
1030            output_path, output_format
1031        );
1032
1033        let output_file = File::create(output_path)
1034            .with_context(|| format!("Failed to create output file: {}", output_path))?;
1035
1036        match output_format {
1037            "csv" => {
1038                let mut writer = csv::Writer::from_writer(output_file);
1039                for entity in &entities_vec {
1040                    let csv_record: EntityChurnRecordCsv = entity.clone().into();
1041                    writer
1042                        .serialize(&csv_record)
1043                        .context("Failed to write CSV record")?;
1044                }
1045                writer.flush().context("Failed to flush CSV writer")?;
1046            }
1047            _ => {
1048                // Default to JSON
1049                serde_json::to_writer_pretty(output_file, &entities_vec)
1050                    .context("Failed to write JSON output")?;
1051            }
1052        }
1053
1054        println!(
1055            "Exported {} entity records",
1056            format_number(entities_vec.len())
1057        );
1058    }
1059
1060    println!("\n=== Analysis Complete ===\n");
1061    Ok(())
1062}