Kubernetes Troubleshooting
Essential kubectl Commands
Pod Inspection
# Get pods with details
kubectl get pods -n production -o wide
kubectl get pods --all-namespaces
kubectl get pods --field-selector status.phase=Running
kubectl get pods --selector app=web-app
# Describe pod (shows events)
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production
# Get pod logs
kubectl logs web-app-7d5c8b9f4-xk2pm -n production
kubectl logs web-app-7d5c8b9f4-xk2pm -n production --previous # Previous container
kubectl logs web-app-7d5c8b9f4-xk2pm -n production -c init-container
kubectl logs -f web-app-7d5c8b9f4-xk2pm -n production # Follow logs
kubectl logs --tail=100 web-app-7d5c8b9f4-xk2pm -n production
kubectl logs --since=1h web-app-7d5c8b9f4-xk2pm -n production
# Get all pod logs from deployment
kubectl logs deployment/web-app -n production --all-containers=true
# Execute commands in pod
kubectl exec -it web-app-7d5c8b9f4-xk2pm -n production -- /bin/sh
kubectl exec web-app-7d5c8b9f4-xk2pm -n production -- env
kubectl exec web-app-7d5c8b9f4-xk2pm -n production -- cat /etc/config/app.yaml
# Copy files to/from pod
kubectl cp web-app-7d5c8b9f4-xk2pm:/app/logs/app.log ./app.log -n production
kubectl cp ./config.yaml web-app-7d5c8b9f4-xk2pm:/tmp/config.yaml -n production
# Port forward
kubectl port-forward web-app-7d5c8b9f4-xk2pm 8080:8080 -n production
kubectl port-forward service/web-app 8080:80 -n production
Deployment Debugging
# Check deployment status
kubectl get deployment web-app -n production
kubectl describe deployment web-app -n production
kubectl rollout status deployment/web-app -n production
kubectl rollout history deployment/web-app -n production
# Check replica sets
kubectl get rs -n production
kubectl describe rs web-app-7d5c8b9f4 -n production
# Scale deployment
kubectl scale deployment web-app --replicas=5 -n production
# Rollback deployment
kubectl rollout undo deployment/web-app -n production
kubectl rollout undo deployment/web-app --to-revision=2 -n production
# Restart deployment (recreate pods)
kubectl rollout restart deployment/web-app -n production
Service and Network Debugging
# Get services
kubectl get svc -n production
kubectl describe svc web-app -n production
# Get endpoints
kubectl get endpoints web-app -n production
kubectl describe endpoints web-app -n production
# Get ingress
kubectl get ingress -n production
kubectl describe ingress web-app -n production
# Get network policies
kubectl get networkpolicy -n production
kubectl describe networkpolicy frontend-to-backend -n production
Resource and Configuration
# Get ConfigMaps and Secrets
kubectl get configmap -n production
kubectl describe configmap app-config -n production
kubectl get configmap app-config -n production -o yaml
kubectl get secret -n production
kubectl describe secret app-secrets -n production
kubectl get secret app-secrets -n production -o jsonpath='{.data.password}' | base64 -d
# Get PVCs and PVs
kubectl get pvc -n production
kubectl describe pvc database-pvc -n production
kubectl get pv
# Get events (sorted by timestamp)
kubectl get events -n production --sort-by='.lastTimestamp'
kubectl get events -n production --field-selector involvedObject.name=web-app-7d5c8b9f4-xk2pm
Debug Pod
Ephemeral Debug Container
# Attach debug container to running pod
kubectl debug -it web-app-7d5c8b9f4-xk2pm -n production \
--image=busybox:latest \
--target=web-app
# Create copy of pod with debug tools
kubectl debug web-app-7d5c8b9f4-xk2pm -n production \
-it \
--image=ubuntu:latest \
--share-processes \
--copy-to=web-app-debug
# Debug with different image
kubectl debug web-app-7d5c8b9f4-xk2pm -n production \
-it \
--image=nicolaka/netshoot:latest \
--target=web-app
Debug on Node
# Create privileged pod on specific node
kubectl debug node/node-01 -it --image=ubuntu:latest
# Access node filesystem
kubectl debug node/node-01 -it --image=ubuntu:latest -- chroot /host
Common Issues and Solutions
Issue 1: Pod in Pending State
# Check pod status and events
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production
# Common causes:
# 1. Insufficient resources
kubectl top nodes
kubectl describe nodes
# 2. PVC not bound
kubectl get pvc -n production
kubectl describe pvc database-pvc -n production
# 3. ImagePullBackOff
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 Events
# 4. Node selector/affinity issues
kubectl get pod web-app-7d5c8b9f4-xk2pm -n production -o yaml | grep -A 5 nodeSelector
Issue 2: CrashLoopBackOff
# Check logs from crashed container
kubectl logs web-app-7d5c8b9f4-xk2pm -n production --previous
# Check if liveness probe is failing
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 "Liveness"
# Debug with different command
kubectl run debug-pod --image=myapp:latest -it --rm --restart=Never -- /bin/sh
# Check resource limits
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 "Limits"
Issue 3: ImagePullBackOff
# Check image pull secret
kubectl get secret registry-credentials -n production -o yaml
# Test image pull manually
kubectl run test-pull --image=myregistry.io/myapp:v1.2.0 \
--image-pull-policy=Always \
--restart=Never \
-n production
# Create/update image pull secret
kubectl create secret docker-registry registry-credentials \
--docker-server=myregistry.io \
--docker-username=myuser \
--docker-password=mypassword \
[email protected] \
-n production
Issue 4: Service Not Accessible
# Check service endpoints
kubectl get endpoints web-app -n production
kubectl describe endpoints web-app -n production
# Verify pod labels match service selector
kubectl get pod web-app-7d5c8b9f4-xk2pm -n production --show-labels
kubectl get service web-app -n production -o yaml | grep -A 3 selector
# Test service connectivity from debug pod
kubectl run debug --image=nicolaka/netshoot:latest -it --rm -n production -- bash
# Inside pod:
curl http://web-app.production.svc.cluster.local
nslookup web-app.production.svc.cluster.local
telnet web-app.production.svc.cluster.local 80
Issue 5: DNS Resolution Issues
# Check CoreDNS pods
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -n kube-system -l k8s-app=kube-dns
# Test DNS resolution
kubectl run dnsutils --image=tutum/dnsutils -it --rm -- bash
# Inside pod:
nslookup kubernetes.default
nslookup web-app.production.svc.cluster.local
dig web-app.production.svc.cluster.local
# Check DNS config in pod
kubectl exec web-app-7d5c8b9f4-xk2pm -n production -- cat /etc/resolv.conf
Issue 6: NetworkPolicy Blocking Traffic
# List network policies
kubectl get networkpolicy -n production
kubectl describe networkpolicy default-deny-all -n production
# Test connectivity
kubectl run test-connectivity --image=nicolaka/netshoot:latest -it --rm -n production -- bash
# Inside pod:
curl -v http://web-app:80
nc -zv web-app 80
# Temporarily allow all traffic (testing only)
kubectl delete networkpolicy --all -n production
Issue 7: High Resource Usage
# Check resource usage
kubectl top nodes
kubectl top pods -n production
kubectl top pod web-app-7d5c8b9f4-xk2pm -n production --containers
# Check resource requests and limits
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 "Limits"
# Get pods sorted by CPU/memory usage
kubectl top pods -n production --sort-by=cpu
kubectl top pods -n production --sort-by=memory
# Check node capacity
kubectl describe node node-01 | grep -A 10 "Allocated resources"
Issue 8: PersistentVolumeClaim Issues
# Check PVC status
kubectl get pvc -n production
kubectl describe pvc database-pvc -n production
# Check PV status
kubectl get pv
kubectl describe pv pvc-abc123
# Check storage class
kubectl get storageclass
kubectl describe storageclass fast-ssd
# Events related to PVC
kubectl get events -n production --field-selector involvedObject.name=database-pvc
Advanced Debugging
API Server Debugging
# Enable verbose output
kubectl get pods -n production -v=9
# Check API server logs (on master node)
journalctl -u kube-apiserver -f
# Check cluster info
kubectl cluster-info
kubectl cluster-info dump > cluster-dump.txt
RBAC Debugging
# Check if ServiceAccount can perform action
kubectl auth can-i get pods --as=system:serviceaccount:production:web-app-sa -n production
# List permissions for ServiceAccount
kubectl describe sa web-app-sa -n production
kubectl describe role web-app-role -n production
kubectl describe rolebinding web-app-rolebinding -n production
# Check all permissions
kubectl auth can-i --list --as=system:serviceaccount:production:web-app-sa -n production
Performance Debugging
# Get resource metrics
kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes
kubectl get --raw /apis/metrics.k8s.io/v1beta1/pods
# Check pod overhead
kubectl get pod web-app-7d5c8b9f4-xk2pm -n production -o json | jq '.spec.overhead'
# Check priority classes
kubectl get priorityclasses
kubectl describe priorityclass high-priority
Diagnostic Tools
Network Tools Container
apiVersion: v1
kind: Pod
metadata:
name: netshoot
namespace: production
spec:
containers:
- name: netshoot
image: nicolaka/netshoot:latest
command: ["/bin/sleep", "3600"]
restartPolicy: Never
Database Client Container
apiVersion: v1
kind: Pod
metadata:
name: postgres-client
namespace: production
spec:
containers:
- name: postgres
image: postgres:15-alpine
command: ["/bin/sleep", "3600"]
env:
- name: PGHOST
value: postgres-service
- name: PGUSER
value: myapp
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: postgres-secrets
key: password
restartPolicy: Never
Quick Reference
Pod States
- Pending: Waiting to be scheduled
- ContainerCreating: Pulling image / creating container
- Running: Pod is running
- Succeeded: All containers exited successfully
- Failed: At least one container failed
- CrashLoopBackOff: Container keeps crashing
- ImagePullBackOff: Cannot pull image
- ErrImagePull: Image pull error
- Unknown: Cannot get pod status
Common Exit Codes
- 0: Success
- 1: General error
- 137: SIGKILL (OOMKilled - out of memory)
- 139: SIGSEGV (segmentation fault)
- 143: SIGTERM (graceful termination)
Best Practices
- Logs: Always check logs first with
kubectl logs - Events: Use
kubectl describe to see events - Labels: Use consistent labels for easier debugging
- Resources: Set appropriate requests and limits
- Health Checks: Implement proper liveness and readiness probes
- Monitoring: Set up comprehensive monitoring and alerting
- Debug Tools: Keep debug containers ready
- Documentation: Document common issues and solutions