Skip to content

Commit

Permalink
Huge refactoring to face recognition
Browse files Browse the repository at this point in the history
  • Loading branch information
henryruhs committed Aug 11, 2023
1 parent 83d913d commit 6f99280
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 75 deletions.
50 changes: 27 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,33 @@ Start the program with arguments:
```
python run.py [options]
-h, --help show this help message and exit
-s SOURCE_PATH, --source SOURCE_PATH select an source image
-t TARGET_PATH, --target TARGET_PATH select an target image or video
-o OUTPUT_PATH, --output OUTPUT_PATH select output file or directory
--frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] list of available frame processors (choices: face_swapper, face_enhancer, frame_enhancer, ...)
--ui-layouts UI_LAYOUTS [UI_LAYOUTS ...] list of available ui layouts (choices: default, ...)
--keep-fps keep target fps
--keep-temp keep temporary frames
--skip-audio skip target audio
--many-faces process every face
--reference-face-position REFERENCE_FACE_POSITION position of the reference face
--reference-frame-number REFERENCE_FRAME_NUMBER number of the reference frame
--similar-face-distance SIMILAR_FACE_DISTANCE face distance used for recognition
--temp-frame-format {jpg,png} image format used for frame extraction
--temp-frame-quality [0-100] image quality used for frame extraction
--output-video-encoder {libx264,libx265,libvpx-vp9,h264_nvenc,hevc_nvenc} encoder used for the output video
--output-video-quality [0-100] quality used for the output video
--max-memory MAX_MEMORY maximum amount of RAM in GB
--execution-providers {cpu} [{cpu} ...] list of available execution providers (choices: cpu, ...)
--execution-thread-count EXECUTION_THREAD_COUNT number of execution threads
--execution-queue-count EXECUTION_QUEUE_COUNT number of execution queries
-v, --version show program's version number and exit
-h, --help show this help message and exit
-s SOURCE_PATH, --source SOURCE_PATH select a source image
-t TARGET_PATH, --target TARGET_PATH select a target image or video
-o OUTPUT_PATH, --output OUTPUT_PATH select output file or directory
--frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] list of available frame processors (choices: face_swapper, face_enhancer, frame_enhancer, ...)
--ui-layouts UI_LAYOUTS [UI_LAYOUTS ...] list of available ui layouts (choices: default, ...)
--keep-fps keep target fps
--keep-temp keep temporary frames
--skip-audio skip target audio
--face-recognition {reference,many} face recognition method
--face-analyser-direction {left-right,right-left,top-bottom,bottom-top,small-large,large-small} direction used for the face analyser
--face-analyser-age {children,teenager,adult,senior} age used for the face analyser
--face-analyser-gender {male,female} gender used for the face analyser
--reference-face-position REFERENCE_FACE_POSITION position of the reference face
--reference-face-distance REFERENCE_FACE_DISTANCE distance between reference face and target face
--reference-frame-number REFERENCE_FRAME_NUMBER number of the reference frame
--trim-frame-start TRIM_FRAME_START start frame used for extraction
--trim-frame-end TRIM_FRAME_END end frame used for extraction
--temp-frame-format {jpg,png} image format used for frame extraction
--temp-frame-quality [0-100] image quality used for frame extraction
--output-video-encoder {libx264,libx265,libvpx-vp9,h264_nvenc,hevc_nvenc} encoder used for the output video
--output-video-quality [0-100] quality used for the output video
--max-memory MAX_MEMORY maximum amount of RAM in GB
--execution-providers {cpu} [{cpu} ...] list of available execution providers (choices: cpu, ...)
--execution-thread-count EXECUTION_THREAD_COUNT number of execution threads
--execution-queue-count EXECUTION_QUEUE_COUNT number of execution queues
-v, --version show program's version number and exit
```


Expand Down
16 changes: 12 additions & 4 deletions roop/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import argparse
import onnxruntime
import tensorflow


import roop.globals
import roop.metadata
from roop.predictor import predict_image, predict_video
Expand All @@ -35,10 +37,13 @@ def parse_args() -> None:
program.add_argument('--keep-fps', help='keep target fps', dest='keep_fps', action='store_true')
program.add_argument('--keep-temp', help='keep temporary frames', dest='keep_temp', action='store_true')
program.add_argument('--skip-audio', help='skip target audio', dest='skip_audio', action='store_true')
program.add_argument('--many-faces', help='process every face', dest='many_faces', action='store_true')
program.add_argument('--face-recognition', help='face recognition method', dest='face_recognition', default='reference', choices=['reference', 'many'])
program.add_argument('--face-analyser-direction', help='direction used for the face analyser', dest='face_analyser_direction', choices=['left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small'])
program.add_argument('--face-analyser-age', help='age used for the face analyser', dest='face_analyser_age', choices=['children', 'teenager', 'adult', 'senior'])
program.add_argument('--face-analyser-gender', help='gender used for the face analyser', dest='face_analyser_gender', choices=['male', 'female'])
program.add_argument('--reference-face-position', help='position of the reference face', dest='reference_face_position', type=int, default=0)
program.add_argument('--reference-face-distance', help='distance between reference face and target face', dest='reference_face_distance', type=float, default=0.85)
program.add_argument('--reference-frame-number', help='number of the reference frame', dest='reference_frame_number', type=int, default=0)
program.add_argument('--similar-face-distance', help='face distance used for recognition', dest='similar_face_distance', type=float, default=0.85)
program.add_argument('--trim-frame-start', help='start frame use for extraction', dest='trim_frame_start', type=int)
program.add_argument('--trim-frame-end', help='end frame use for extraction', dest='trim_frame_end', type=int)
program.add_argument('--temp-frame-format', help='image format used for frame extraction', dest='temp_frame_format', default='jpg', choices=['jpg', 'png'])
Expand All @@ -62,10 +67,13 @@ def parse_args() -> None:
roop.globals.keep_fps = args.keep_fps
roop.globals.keep_temp = args.keep_temp
roop.globals.skip_audio = args.skip_audio
roop.globals.many_faces = args.many_faces
roop.globals.face_recognition = args.face_recognition
roop.globals.face_analyser_direction = args.face_analyser_direction
roop.globals.face_analyser_age = args.face_analyser_age
roop.globals.face_analyser_gender = args.face_analyser_gender
roop.globals.reference_face_position = args.reference_face_position
roop.globals.reference_frame_number = args.reference_frame_number
roop.globals.similar_face_distance = args.similar_face_distance
roop.globals.reference_face_distance = args.reference_face_distance
roop.globals.trim_frame_start = args.trim_frame_start
roop.globals.trim_frame_end = args.trim_frame_end
roop.globals.temp_frame_format = args.temp_frame_format
Expand Down
57 changes: 52 additions & 5 deletions roop/face_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy

import roop.globals
from roop.typing import Frame, Face
from roop.typing import Frame, Face, FaceAnalyserDirection, FaceAnalyserAge, FaceAnalyserGender

FACE_ANALYSER = None
THREAD_LOCK = threading.Lock()
Expand Down Expand Up @@ -38,21 +38,68 @@ def get_one_face(frame: Frame, position: int = 0) -> Optional[Face]:

def get_many_faces(frame: Frame) -> Optional[List[Face]]:
    """Detect every face in the frame, then apply the globally configured
    analyser post-processing: direction sort, age filter and gender filter.

    Returns None when the underlying analyser raises instead of yielding
    faces (e.g. no model loaded or an unreadable frame).
    """
    try:
        faces = get_face_analyser().get(frame)
        # each step is optional and only runs when its global is configured
        if roop.globals.face_analyser_direction:
            faces = sort_by_direction(faces, roop.globals.face_analyser_direction)
        if roop.globals.face_analyser_age:
            faces = filter_by_age(faces, roop.globals.face_analyser_age)
        if roop.globals.face_analyser_gender:
            faces = filter_by_gender(faces, roop.globals.face_analyser_gender)
        return faces
    except (AttributeError, ValueError):
        return None


def find_similar_face(frame: Frame, reference_face: Face, face_distance: float) -> Optional[Face]:
    """Return the first face in the frame whose squared euclidean distance
    between embeddings is below ``face_distance``, or None when no face
    qualifies.

    :param frame: frame to scan for candidate faces
    :param reference_face: face whose embedding candidates are compared against
    :param face_distance: maximum accepted squared embedding distance
    """
    many_faces = get_many_faces(frame)
    if many_faces:
        for face in many_faces:
            # both faces need an embedding before a distance can be computed
            if hasattr(face, 'normed_embedding') and hasattr(reference_face, 'normed_embedding'):
                current_face_distance = numpy.sum(numpy.square(face.normed_embedding - reference_face.normed_embedding))
                if current_face_distance < face_distance:
                    return face
    return None


def sort_by_direction(faces: 'List[Face]', direction: 'FaceAnalyserDirection') -> 'List[Face]':
    """Return the faces ordered according to the requested direction.

    Left/right and top/bottom use the bounding box origin; small/large use
    the bounding box area. Unknown directions leave the order untouched.
    """
    def bbox_left(face):
        return face['bbox'][0]

    def bbox_top(face):
        return face['bbox'][1]

    def bbox_area(face):
        return (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1])

    # direction -> (sort key, descending?)
    strategies = {
        'left-right': (bbox_left, False),
        'right-left': (bbox_left, True),
        'top-bottom': (bbox_top, False),
        'bottom-top': (bbox_top, True),
        'small-large': (bbox_area, False),
        'large-small': (bbox_area, True),
    }
    if direction in strategies:
        sort_key, descending = strategies[direction]
        return sorted(faces, key=sort_key, reverse=descending)
    return faces


def filter_by_age(faces: 'List[Face]', age: 'FaceAnalyserAge') -> 'List[Face]':
    """Keep only the faces whose estimated age falls inside the requested
    category.

    The previous chain of plain ``face['age'] < N`` tests overlapped: a
    5 year old matched 'teenager', 'adult' and 'senior' as well. Each
    category is now an exclusive half-open range [lower, upper).
    Unknown categories yield an empty list, matching the old behaviour.
    """
    age_ranges = {
        'children': (0, 10),
        'teenager': (10, 20),
        'adult': (20, 60),
        'senior': (60, 100),
    }
    if age not in age_ranges:
        return []
    lower, upper = age_ranges[age]
    return [face for face in faces if lower <= face['age'] < upper]


def filter_by_gender(faces: 'List[Face]', gender: 'FaceAnalyserGender') -> 'List[Face]':
    """Keep only the faces whose gender code matches the requested gender."""
    # the analyser encodes gender as 1 for male and 0 for female;
    # an unknown gender maps to None and therefore matches nothing
    gender_code = {'male': 1, 'female': 0}.get(gender)
    return [face for face in faces if face['gender'] == gender_code]


def get_faces_total(frame: Frame) -> int:
    """Return the number of detected (and filtered) faces in the frame.

    get_many_faces() returns None when the analyser fails; treat that as
    zero faces instead of crashing on len(None).
    """
    faces = get_many_faces(frame)
    return len(faces) if faces else 0
13 changes: 9 additions & 4 deletions roop/globals.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import List, Optional

from roop.typing import FaceRecognition, FaceAnalyserDirection, FaceAnalyserAge, FaceAnalyserGender, LogLevel, TempFrameFormat

source_path: Optional[str] = None
target_path: Optional[str] = None
output_path: Optional[str] = None
Expand All @@ -9,18 +11,21 @@
keep_fps: Optional[bool] = None
keep_temp: Optional[bool] = None
skip_audio: Optional[bool] = None
many_faces: Optional[bool] = None
face_recognition: Optional[FaceRecognition] = None
face_analyser_direction: Optional[FaceAnalyserDirection] = None
face_analyser_age: Optional[FaceAnalyserAge] = None
face_analyser_gender: Optional[FaceAnalyserGender] = None
reference_face_position: Optional[int] = None
reference_frame_number: Optional[int] = None
similar_face_distance: Optional[float] = None
reference_face_distance: Optional[float] = None
trim_frame_start: Optional[int] = None
trim_frame_end: Optional[int] = None
temp_frame_format: Optional[str] = None
temp_frame_format: Optional[TempFrameFormat] = None
temp_frame_quality: Optional[int] = None
output_video_encoder: Optional[str] = None
output_video_quality: Optional[int] = None
max_memory: Optional[int] = None
execution_providers: List[str] = []
execution_thread_count: Optional[int] = None
execution_queue_count: Optional[int] = None
log_level: str = 'error'
log_level: LogLevel = 'error'
16 changes: 8 additions & 8 deletions roop/processors/frame/__modules__/face_swapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,21 @@ def swap_face(source_face: Face, target_face: Face, temp_frame: Frame) -> Frame:


def process_frame(source_face: Face, reference_face: Face, temp_frame: Frame) -> Frame:
    """Swap source_face onto the target face(s) of temp_frame according to
    the configured recognition method.

    'reference' swaps only the face closest to reference_face within
    reference_face_distance; 'many' swaps every detected face. The checks
    use ``in`` so both can apply when the setting contains both tokens.
    """
    if 'reference' in roop.globals.face_recognition:
        target_face = find_similar_face(temp_frame, reference_face, roop.globals.reference_face_distance)
        if target_face:
            temp_frame = swap_face(source_face, target_face, temp_frame)
    if 'many' in roop.globals.face_recognition:
        many_faces = get_many_faces(temp_frame)
        if many_faces:
            for target_face in many_faces:
                temp_frame = swap_face(source_face, target_face, temp_frame)
    return temp_frame


def process_frames(source_path: str, temp_frame_paths: List[str], update: Callable[[], None]) -> None:
source_face = get_one_face(cv2.imread(source_path))
reference_face = get_face_reference() if not roop.globals.many_faces else None
reference_face = get_face_reference() if 'reference' in roop.globals.face_recognition else None
for temp_frame_path in temp_frame_paths:
temp_frame = cv2.imread(temp_frame_path)
result_frame = process_frame(source_face, reference_face, temp_frame)
Expand All @@ -87,13 +87,13 @@ def process_frames(source_path: str, temp_frame_paths: List[str], update: Callab
def process_image(source_path: str, target_path: str, output_path: str) -> None:
    """Swap the source face into the target image and write the result.

    :param source_path: image providing the face to insert
    :param target_path: image to process
    :param output_path: destination for the processed image
    """
    source_face = get_one_face(cv2.imread(source_path))
    target_frame = cv2.imread(target_path)
    # a reference face is only needed for the 'reference' recognition method
    reference_face = get_one_face(target_frame, roop.globals.reference_face_position) if 'reference' in roop.globals.face_recognition else None
    result_frame = process_frame(source_face, reference_face, target_frame)
    cv2.imwrite(output_path, result_frame)


def process_video(source_path: str, temp_frame_paths: List[str]) -> None:
if not roop.globals.many_faces and not get_face_reference():
if 'reference' in roop.globals.face_recognition and not get_face_reference():
reference_frame = cv2.imread(temp_frame_paths[roop.globals.reference_frame_number])
reference_face = get_one_face(reference_frame, roop.globals.reference_face_position)
set_face_reference(reference_face)
Expand Down
10 changes: 9 additions & 1 deletion roop/typing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
from typing import Any, Literal

from insightface.app.common import Face
import numpy

# re-export the insightface face object under the project namespace
Face = Face
# a frame is any numpy array (BGR image as produced by cv2.imread)
Frame = numpy.ndarray[Any, Any]

# closed sets of accepted string values, shared by argparse choices,
# roop.globals annotations and the analyser/swapper modules
FaceRecognition = Literal['reference', 'many']
FaceAnalyserDirection = Literal['left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small']
FaceAnalyserAge = Literal['children', 'teenager', 'adult', 'senior']
FaceAnalyserGender = Literal['male', 'female']
TempFrameFormat = Literal['jpg', 'png']
OutputVideoEncoder = Literal['libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc']
LogLevel = Literal['error']
Loading

0 comments on commit 6f99280

Please sign in to comment.