update
This commit is contained in:
93
kubespray/roles/recover_control_plane/etcd/tasks/main.yml
Normal file
93
kubespray/roles/recover_control_plane/etcd/tasks/main.yml
Normal file
@@ -0,0 +1,93 @@
|
||||
---
|
||||
# Probe the etcd endpoints with etcdctl and keep the raw result in
# `etcd_endpoint_health`; the set_fact tasks that follow parse its
# stdout/stderr.
- name: Get etcd endpoint health
  command: "{{ bin_dir }}/etcdctl endpoint health"
  register: etcd_endpoint_health
  # A failing probe must not stop the play: its output is inspected afterwards.
  ignore_errors: true  # noqa ignore-errors
  changed_when: false
  # Execute even in check mode so the registered result is available.
  check_mode: false
  environment:
    # Environment values are strings; quote the number to make that explicit.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
|
||||
|
||||
# Derive the `healthy` fact from the stderr of the endpoint health probe.
# The fact is true when stderr begins with 'Error: unhealthy cluster'.
# NOTE(review): that polarity reads inverted for a fact called `healthy`,
# and later tasks gate on `not healthy` - confirm the intent before changing
# the expression.
- name: Set healthy fact
  set_fact:
    healthy: "{{ etcd_endpoint_health.stderr is match('Error: unhealthy cluster') }}"
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
|
||||
|
||||
# Compare the number of 'is healthy' lines on stdout with the number of
# 'is unhealthy' lines on stderr of the endpoint health probe.
# NOTE(review): `>=` treats an equal count as having quorum; presumably a
# strict majority is what etcd needs - confirm before tightening.
- name: Set has_quorum fact
  set_fact:
    # Folded scalar: the lines below are joined into one Jinja expression.
    has_quorum: >-
      {{ etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length
      >= etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length }}
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
|
||||
|
||||
# Hand over to recover_lost_quorum.yml when there are broken members and the
# `has_quorum` fact is false.
- name: Recover etcd cluster that lost quorum
  include_tasks: recover_lost_quorum.yml
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
    - not has_quorum
|
||||
|
||||
# Delete the etcd data directory on each host of the `broken_etcd` group.
# The task is delegated to the host named by the current loop item.
- name: Remove etcd data dir
  file:
    path: "{{ etcd_data_dir }}"
    state: absent
  delegate_to: "{{ item }}"
  loop: "{{ groups['broken_etcd'] }}"
  # Best effort: a failure on one host does not abort the recovery.
  ignore_errors: true  # noqa ignore-errors
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
    - has_quorum
|
||||
|
||||
# Remove every file in the etcd certificate directory whose name contains the
# name of a broken member. A shell is needed for the glob. The per-item
# results are registered and checked by the task that follows.
- name: Delete old certificates
  # noqa 302 ignore-error - rm is ok here for now
  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
  loop: "{{ groups['broken_etcd'] }}"
  # The variable name is spelled as the follow-up task expects it.
  register: delete_old_cerificates
  # Errors are evaluated explicitly by the next task.
  ignore_errors: true
  # Explicit boolean test rather than relying on the truthiness of a list.
  when: groups['broken_etcd'] | length > 0
|
||||
|
||||
# Walk the registered results of the certificate removal and abort for any
# item whose `rm` exited non-zero for a reason other than the files already
# being absent.
- name: Fail if unable to delete old certificates
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  loop: "{{ delete_old_cerificates.results }}"
  changed_when: false
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
    # List entries are AND-ed, same as the original single expression.
    - item.rc != 0
    - "'No such file or directory' not in item.stderr"
|
||||
|
||||
# List the cluster members with etcdctl and register the output as
# `member_list`; the removal task that follows parses its stdout lines.
- name: Get etcd cluster members
  command: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  changed_when: false
  # Execute even in check mode so the registered result is available.
  check_mode: false
  environment:
    # Environment values are strings; quote the number to make that explicit.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
    - not healthy
    - has_quorum
|
||||
|
||||
# Pair every broken host with every line of the member list and remove the
# member whose name equals that host's `etcd_member_name`.
# Each line is stripped of spaces and split on commas: field 0 is passed to
# `member remove`, field 2 is compared with the member name.
- name: Remove broken cluster members
  command: "{{ bin_dir }}/etcdctl member remove {{ item[1].replace(' ','').split(',')[0] }}"
  environment:
    # Environment values are strings; quote the number to make that explicit.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  with_nested:
    - "{{ groups['broken_etcd'] }}"
    # Loop terms are templated before `when` is evaluated; the default keeps
    # the task skippable when the member list task did not run.
    - "{{ member_list.stdout_lines | default([]) }}"
  when:
    # Explicit boolean test rather than relying on the truthiness of a list.
    - groups['broken_etcd'] | length > 0
    - not healthy
    - has_quorum
    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
# Take a snapshot with etcdctl into /tmp/snapshot.db. Skipped when the caller
# supplies a snapshot through `etcd_snapshot`.
- name: Save etcd snapshot
  command: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db"
  # One mapping instead of a list of single-key mappings.
  environment:
    # Environment values are strings; quote the number to make that explicit.
    ETCDCTL_API: "3"
    # Only the first entry of the comma-separated address list is used.
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses.split(',') | first }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  when: etcd_snapshot is not defined
|
||||
|
||||
# Copy the snapshot named by `etcd_snapshot` to /tmp/snapshot.db. Runs only
# when that variable is provided.
- name: Transfer etcd snapshot to host
  copy:
    src: "{{ etcd_snapshot }}"
    dest: /tmp/snapshot.db
    # Quoted so the YAML parser cannot reinterpret the octal mode as a number.
    mode: "0640"
  when: etcd_snapshot is defined
|
||||
|
||||
# Put the etcd systemd unit into the stopped state.
- name: Stop etcd
  systemd:
    state: stopped
    name: etcd
|
||||
|
||||
# Make sure the path given by `etcd_data_dir` no longer exists.
- name: Remove etcd data-dir
  file:
    state: absent
    path: "{{ etcd_data_dir }}"
|
||||
|
||||
# Restore /tmp/snapshot.db into `etcd_data_dir`. The initial cluster passed to
# etcdctl contains only this member (name=peer URL).
- name: Restore etcd snapshot  # noqa 301 305
  # Folded scalar: the lines below are joined with single spaces into one
  # command line.
  shell: >-
    {{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db
    --name {{ etcd_member_name }}
    --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }}
    --initial-cluster-token k8s_etcd
    --initial-advertise-peer-urls {{ etcd_peer_url }}
    --data-dir {{ etcd_data_dir }}
  # One mapping instead of a list of single-key mappings.
  environment:
    # Environment values are strings; quote the number to make that explicit.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
|
||||
|
||||
# Clean up the temporary snapshot file at /tmp/snapshot.db.
- name: Remove etcd snapshot
  file:
    state: absent
    path: /tmp/snapshot.db
|
||||
|
||||
# Recursively set owner and group of the etcd data directory to `etcd`.
- name: Change etcd data-dir owner
  file:
    recurse: true
    group: etcd
    owner: etcd
    path: "{{ etcd_data_dir }}"
|
||||
|
||||
# Rewrite the ETCD_INITIAL_CLUSTER= line in /etc/etcd.env. The key is kept via
# the capture group and the value becomes this member only (name=peer URL).
- name: Reconfigure etcd
  replace:
    regexp: "^(ETCD_INITIAL_CLUSTER=).*"
    replace: '\1{{ etcd_member_name }}={{ etcd_peer_url }}'
    path: /etc/etcd.env
|
||||
|
||||
# Put the etcd systemd unit into the started state.
- name: Start etcd
  systemd:
    state: started
    name: etcd
|
||||
Reference in New Issue
Block a user