From aaf163514127e7d0700c3473500eacc2ed28b00b Mon Sep 17 00:00:00 2001
From: Arfib Manon <manon.arfib@student-cs.fr>
Date: Sun, 21 May 2023 23:03:23 +0200
Subject: [PATCH] =?UTF-8?q?regroupement=20des=20fichiers=20de=20cr=C3=A9at?=
 =?UTF-8?q?ion=20des=20csv=20et=20t=C3=A9l=C3=A9chargement=20des=20fichier?=
 =?UTF-8?q?s=20audio=20sous=20des=20fichiers=20plus=20g=C3=A9n=C3=A9raux?=
 =?UTF-8?q?=20et=20adaptables?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 creation_datasets/README.txt                  |  2 -
 creation_datasets/api_data_test_america.py    | 31 ----------
 creation_datasets/api_data_test_france.py     | 33 ----------
 creation_datasets/api_data_train.py           | 50 ---------------
 .../api_download_test_america.py              | 58 -----------------
 creation_datasets/api_download_test_france.py | 62 -------------------
 creation_datasets/create_csv.py               | 50 +++++++++++++++
 ...pi_download_train.py => download_files.py} | 38 ++++++------
 8 files changed, 68 insertions(+), 256 deletions(-)
 delete mode 100644 creation_datasets/api_data_test_america.py
 delete mode 100644 creation_datasets/api_data_test_france.py
 delete mode 100644 creation_datasets/api_data_train.py
 delete mode 100644 creation_datasets/api_download_test_america.py
 delete mode 100644 creation_datasets/api_download_test_france.py
 create mode 100644 creation_datasets/create_csv.py
 rename creation_datasets/{api_download_train.py => download_files.py} (60%)

diff --git a/creation_datasets/README.txt b/creation_datasets/README.txt
index f53c062..e69de29 100644
--- a/creation_datasets/README.txt
+++ b/creation_datasets/README.txt
@@ -1,2 +0,0 @@
-Pour télécharger les fichiers audio nécessaires au test des modèles américains : lancer le fichier api_download_test_america.py, les fichiers sont enregistrés dans un dossier ./creation_dataset/audio_files_test
-Pour télécharger les fichiers audio nécessaires au réentraînement du modèle sur les oiseaux européens : lancer le fichier api_download_train.py, les fichiers sont enregistrés dans un dossier ./creation_dataset/audio_files_france_train
diff --git a/creation_datasets/api_data_test_america.py b/creation_datasets/api_data_test_america.py
deleted file mode 100644
index f70e7f0..0000000
--- a/creation_datasets/api_data_test_america.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import pandas as pd
-import requests
-from tqdm import tqdm
-
-# crée un fichier csv avec le lien vers 5 fichiers audio pour chaque espèce considérée (espèces de la base de donnée d'entraînement des modèles à tester)
-
-species = ["American Bittern", "American Crow", "American Goldfinch", "American Kestrel", "Buff-bellied Pipit", "American Redstart", "American Robin", "American Wigeon", "American Woodcock", "American Tree Sparrow", "Anna's Hummingbird", "Ash-throated Flycatcher", "Baird's Sandpiper", "Bald Eagle", "Baltimore Oriole", "Sand Martin", "Barn Swallow", "Black-and-white Warbler", "Belted Kingfisher", "Bell's Sparrow", "Bewick's Wren", "Black-billed Cuckoo", "Black-billed Magpie", "Blackburnian Warbler", "Black-capped Chickadee", "Black-chinned Hummingbird", "Black-headed Grosbeak", "Blackpoll Warbler", "Black-throated Sparrow", "Black Phoebe", "Blue Grosbeak", "Blue Jay", "Brown-headed Cowbird", "Bobolink", "Bonaparte's Gull", "Barred Owl", "Brewer's Blackbird", "Brewer's Sparrow", "Brown Creeper", "Brown Thrasher", "Broad-tailed Hummingbird", "Broad-winged Hawk", "Black-throated Blue Warbler", "Black-throated Green Warbler", "Black-throated Grey Warbler", "Bufflehead", "Blue-grey Gnatcatcher", "Blue-headed Vireo", "Bullock's Oriole", "American Bushtit", "Blue-winged Teal", "Blue-winged Warbler", "Cactus Wren", "California Gull", "California Quail", "Cape May Warbler", "Canada Goose", "Canada Warbler", "Canyon Wren", "Carolina Wren", "Cassin's Finch", "Caspian Tern", "Cassin's Vireo", "Cedar Waxwing", "Chipping Sparrow", "Chimney Swift", "Chestnut-sided Warbler", "Chukar Partridge", "Clark's Nutcracker", "American Cliff Swallow", "Common Goldeneye", "Common Grackle", "Common Loon", "Common Merganser", "Common Nighthawk", "Northern Raven", "Common Redpoll", "Common Tern", "Common Yellowthroat", "Cooper's Hawk", "Costa's Hummingbird", "California Scrub Jay", "Dark-eyed Junco", "Double-crested Cormorant", "Downy Woodpecker", "American Dusky Flycatcher", "Black-necked Grebe", "Eastern Bluebird", "Eastern Kingbird", "Eastern Meadowlark", "Eastern Phoebe", "Eastern Towhee", "Eastern Wood Pewee", "Eurasian Collared Dove", "Common Starling", "Evening Grosbeak", "Field 
Sparrow", "Fish Crow", "Red Fox Sparrow", "Gadwall", "Grey-crowned Rosy Finch", "Green-tailed Towhee", "Eurasian Teal", "Golden-crowned Kinglet", "Golden-crowned Sparrow", "Golden Eagle", "Great Blue Heron", "Great Crested Flycatcher", "Great Egret", "Greater Roadrunner", "Greater Yellowlegs", "Great Horned Owl", "Green Heron", "Great-tailed Grackle", "Grey Catbird", "American Grey Flycatcher", "Hairy Woodpecker", "Hammond's Flycatcher", "European Herring Gull", "Hermit Thrush", "Hooded Merganser", "Hooded Warbler", "Horned Grebe", "Horned Lark", "House Finch", "House Sparrow", "House Wren", "Indigo Bunting", "Juniper Titmouse",          "Killdeer",
-           "Ladder-backed Woodpecker", "Lark Sparrow", "Lazuli Bunting", "Least Bittern", "Least Flycatcher", "Least Sandpiper", "LeConte's Thrasher", "Lesser Goldfinch", "Lesser Nighthawk", "Lesser Yellowlegs", "Lewis' Woodpecker", "Lincoln's Sparrow", "Long-billed Curlew", "Long-billed Dowitcher", "Loggerhead Shrike", "Long-tailed Duck", "Louisiana Waterthrush", "MacGillivray's Warbler", "Magnolia Warbler", "Mallard", "Marsh Wren", "Merlin", "Mountain Bluebird", "Mountain Chickadee", "Mourning Dove", "Northern Cardinal", "Northern Flicker", "Northern Harrier", "Northern Mockingbird", "Northern Parula", "Northern Pintail", "Northern Shoveler", "Northern Waterthrush", "Northern Rough-winged Swallow", "Nuttall's Woodpecker", "Olive-sided Flycatcher", "Orange-crowned Warbler", "Western Osprey", "Ovenbird", "Palm Warbler", "Pacific-slope Flycatcher", "Pectoral Sandpiper", "Peregrine Falcon", "Phainopepla", "Pied-billed Grebe", "Pileated Woodpecker", "Pine Grosbeak", "Pinyon Jay", "Pine Siskin", "Pine Warbler", "Plumbeous Vireo", "Prairie Warbler", "Purple Finch", "Pygmy Nuthatch", "Red-breasted Merganser", "Red-breasted Nuthatch", "Red-breasted Sapsucker", "Red-bellied Woodpecker", "Red Crossbill", "Redhead", "Red-eyed Vireo", "Red-necked Phalarope", "Red-shouldered Hawk", "Red-tailed Hawk", "Red-winged Blackbird", "Ring-billed Gull", "Ring-necked Duck", "Rose-breasted Grosbeak", "Rock Dove", "Rock Wren", "Ruby-throated Hummingbird", "Ruby-crowned Kinglet", "Ruddy Duck", "Ruffed Grouse", "Rufous Hummingbird", "Rusty Blackbird", "Sagebrush Sparrow", "Sage Thrasher", "Savannah Sparrow", "Say's Phoebe", "Scarlet Tanager", "Scott's Oriole", "Semipalmated Plover", "Semipalmated Sandpiper", "Short-eared Owl", "Sharp-shinned Hawk", "Snow Bunting", "Snow Goose", "Solitary Sandpiper", "Song Sparrow", "Sora", "Spotted Sandpiper", "Spotted Towhee", "Steller's Jay", "Swainson's Hawk", "Swamp Sparrow", "Swainson's Thrush", "Tree Swallow", "Trumpeter Swan", "Tufted Titmouse", 
"Tundra Swan", "Veery", "Vesper Sparrow", "Violet-green Swallow", "Warbling Vireo", "Western Bluebird", "Western Grebe", "Western Kingbird", "Western Meadowlark", "Western Sandpiper", "Western Tanager", "Western Wood Pewee", "White-breasted Nuthatch", "White-crowned Sparrow", "White-faced Ibis", "White-throated Sparrow", "White-throated Swift", "Willow Flycatcher", "Wilson's Snipe", "Wild Turkey", "Winter Wren", "Wilson's Warbler", "Wood Duck", "Woodhouse's Scrub Jay", "Wood Thrush", "American Coot", "Yellow-bellied Flycatcher", "Yellow-bellied Sapsucker", "Yellow-headed Blackbird", "Mangrove Warbler", "Myrtle Warbler", "Yellow-throated Vireo"]
-
-df = pd.DataFrame()
-for specie in species:
-    i = 0
-    print(specie)
-    response = requests.get(
-        f'https://xeno-canto.org/api/2/recordings?query={specie}+since:2020-11-11')
-    js = response.json()
-    ids, files, extensions, ens, lengths, gens = [], [], [], [], [], []
-    for recording in js["recordings"]:
-        if i < 5:
-            ids.append(recording["id"])
-            files.append(recording["file"])
-            extensions.append(recording["file-name"][-4:].lower())
-            ens.append(recording["en"])
-            lengths.append(recording["length"])
-            gens.append(recording["gen"])
-            i += 1
-    df_ = pd.DataFrame.from_records(
-        {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths})
-    df = df.append(df_)
-
-df.to_csv("creation_datasets/fichiers_csv/birds_anerica.csv", index=False)
diff --git a/creation_datasets/api_data_test_france.py b/creation_datasets/api_data_test_france.py
deleted file mode 100644
index 1fa3629..0000000
--- a/creation_datasets/api_data_test_france.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import pandas as pd
-import requests
-from tqdm import tqdm
-
-# crée un fichier csv (birds_france_test.csv) avec le lien vers 5 fichiers audio des espèces considérées pour l'entraînement du modèle français
-
-species = {'Black Redstart': 0, 'Black Woodpecker': 1, 'Black-crowned Night Heron': 2, 'Brambling': 3, 'Carrion Crow': 4, "Cetti's Warbler": 5, 'Cirl Bunting': 6, 'Coal Tit': 7, 'Common Blackbird': 8, 'Common Chaffinch': 9, 'Common Chiffchaff': 10, 'Common Crane': 11, 'Common Cuckoo': 12, 'Common Firecrest': 13, 'Common Linnet': 14, 'Common Moorhen': 15, 'Common Nightingale': 16, 'Common Redstart': 17, 'Common Reed Bunting': 18, 'Common Ringed Plover': 19, 'Common Sandpiper': 20, 'Common Snipe': 21, 'Common Starling': 22, 'Common Whitethroat': 23, 'Corn Bunting': 24, 'Dunlin': 25, 'Dunnock': 26, 'Eurasian Blackcap': 27, 'Eurasian Blue Tit': 28, 'Eurasian Bullfinch': 29, 'Eurasian Coot': 30, 'Eurasian Curlew': 31, 'Eurasian Eagle-Owl': 32, 'Eurasian Golden Oriole': 33, 'Eurasian Jay': 34, 'Eurasian Nuthatch': 35, 'Eurasian Reed Warbler': 36, 'Eurasian Skylark': 37, 'Eurasian Stone-curlew': 38, 'Eurasian Treecreeper': 39, 'Eurasian Wren': 40, 'European Crested Tit': 41, 'European Goldfinch': 42, 'European Green Woodpecker': 43,
-           'European Greenfinch': 44, 'European Nightjar': 45, 'European Pied Flycatcher': 46, 'European Robin': 47, 'European Serin': 48, 'Garden Warbler': 49, 'Goldcrest': 50, 'Great Spotted Woodpecker': 51, 'Great Tit': 52, 'Green Sandpiper': 53, 'Grey Heron': 54, 'House Sparrow': 55, 'Little Bittern': 56, 'Little Grebe': 57, 'Little Owl': 58, 'Long-eared Owl': 59, 'Long-tailed Tit': 60, 'Marsh Tit': 61, 'Marsh Warbler': 62, 'Meadow Pipit': 63, 'Melodious Warbler': 64, 'Middle Spotted Woodpecker': 65, 'Mistle Thrush': 66, 'Ortolan Bunting': 67, 'Red Crossbill': 68, 'Redwing': 69, 'Ring Ouzel': 70, 'Sardinian Warbler': 71, 'Short-toed Treecreeper': 72, 'Song Thrush': 73, 'Spotted Crake': 74, 'Spotted Flycatcher': 75, 'Subalpine Warbler': 76, 'Tawny Owl': 77, 'Tawny Pipit': 78, 'Tree Pipit': 79, 'Water Pipit': 80, 'Water Rail': 81, 'Western Barn Owl': 82, "Western Bonelli's Warbler": 83, 'Western Jackdaw': 84, 'Western Yellow Wagtail': 85, 'White Wagtail': 86, 'Willow Warbler': 87, 'Wood Warbler': 88, 'Woodlark': 89, 'Yellowhammer': 90}
-liste_species = species.keys()
-print(liste_species)
-
-df = pd.DataFrame()
-for specie in liste_species:
-    i = 0
-    print(specie)
-    response = requests.get(
-        f'https://xeno-canto.org/api/2/recordings?query={specie}+since:2021-11-11')
-    js = response.json()
-    ids, files, extensions, ens, lengths, gens = [], [], [], [], [], []
-    for recording in js["recordings"]:
-        if i < 5 and not pd.isnull(recording["file-name"]):
-            ids.append(recording["id"])
-            files.append(recording["file"])
-            extensions.append(recording["file-name"][-4:].lower())
-            ens.append(recording["en"])
-            lengths.append(recording["length"])
-            gens.append(recording["gen"])
-            i += 1
-    df_ = pd.DataFrame.from_records(
-        {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths})
-    df = df.append(df_)
-
-df.to_csv("creation_datasets/fichiers_csv/birds_france_test.csv", index=False)
diff --git a/creation_datasets/api_data_train.py b/creation_datasets/api_data_train.py
deleted file mode 100644
index 720daaf..0000000
--- a/creation_datasets/api_data_train.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import pandas as pd
-import requests
-from tqdm import tqdm
-
-# crée un fichier csv avec le lien vers tous les fichiers audio enregistrés en France, pour pouvoir réentraîner sur un dataset français le modèle sélectionné qui fonctionnait bien sur les oiseaux européens
-
-# species = ["Alder Flycatcher", "American Avocet", "American Bittern", "American Crow", "American Goldfinch", "American Kestrel", "Buff-bellied Pipit", "American Redstart", "American Robin", "American Wigeon", "American Woodcock", "American Tree Sparrow", "Anna's Hummingbird", "Ash-throated Flycatcher", "Baird's Sandpiper", "Bald Eagle", "Baltimore Oriole", "Sand Martin", "Barn Swallow", "Black-and-white Warbler", "Belted Kingfisher", "Bell's Sparrow", "Bewick's Wren", "Black-billed Cuckoo", "Black-billed Magpie", "Blackburnian Warbler", "Black-capped Chickadee", "Black-chinned Hummingbird", "Black-headed Grosbeak", "Blackpoll Warbler", "Black-throated Sparrow", "Black Phoebe", "Blue Grosbeak", "Blue Jay", "Brown-headed Cowbird", "Bobolink", "Bonaparte's Gull", "Barred Owl", "Brewer's Blackbird", "Brewer's Sparrow", "Brown Creeper", "Brown Thrasher", "Broad-tailed Hummingbird", "Broad-winged Hawk", "Black-throated Blue Warbler", "Black-throated Green Warbler", "Black-throated Grey Warbler", "Bufflehead", "Blue-grey Gnatcatcher", "Blue-headed Vireo", "Bullock's Oriole", "American Bushtit", "Blue-winged Teal", "Blue-winged Warbler", "Cactus Wren", "California Gull", "California Quail", "Cape May Warbler", "Canada Goose", "Canada Warbler", "Canyon Wren", "Carolina Wren", "Cassin's Finch", "Caspian Tern", "Cassin's Vireo", "Cedar Waxwing", "Chipping Sparrow", "Chimney Swift", "Chestnut-sided Warbler", "Chukar Partridge", "Clark's Nutcracker", "American Cliff Swallow", "Common Goldeneye", "Common Grackle", "Common Loon", "Common Merganser", "Common Nighthawk", "Northern Raven", "Common Redpoll", "Common Tern", "Common Yellowthroat", "Cooper's Hawk", "Costa's Hummingbird", "California Scrub Jay", "Dark-eyed Junco", "Double-crested Cormorant", "Downy Woodpecker", "American Dusky Flycatcher", "Black-necked Grebe", "Eastern Bluebird", "Eastern Kingbird", "Eastern Meadowlark", "Eastern Phoebe", "Eastern Towhee", "Eastern Wood Pewee", "Eurasian Collared Dove", "Common 
Starling", "Evening Grosbeak", "Field Sparrow", "Fish Crow", "Red Fox Sparrow", "Gadwall", "Grey-crowned Rosy Finch", "Green-tailed Towhee", "Eurasian Teal", "Golden-crowned Kinglet", "Golden-crowned Sparrow", "Golden Eagle", "Great Blue Heron", "Great Crested Flycatcher", "Great Egret", "Greater Roadrunner", "Greater Yellowlegs", "Great Horned Owl", "Green Heron", "Great-tailed Grackle", "Grey Catbird", "American Grey Flycatcher", "Hairy Woodpecker", "Hammond's Flycatcher", "European Herring Gull", "Hermit Thrush", "Hooded Merganser", "Hooded Warbler", "Horned Grebe", "Horned Lark", "House Finch", "House Sparrow", "House Wren", "Indigo Bunting", "Juniper Titmouse",
-#           "Killdeer", "Ladder-backed Woodpecker", "Lark Sparrow", "Lazuli Bunting", "Least Bittern", "Least Flycatcher", "Least Sandpiper", "LeConte's Thrasher", "Lesser Goldfinch", "Lesser Nighthawk", "Lesser Yellowlegs", "Lewis' Woodpecker", "Lincoln's Sparrow", "Long-billed Curlew", "Long-billed Dowitcher", "Loggerhead Shrike", "Long-tailed Duck", "Louisiana Waterthrush", "MacGillivray's Warbler", "Magnolia Warbler", "Mallard", "Marsh Wren", "Merlin", "Mountain Bluebird", "Mountain Chickadee", "Mourning Dove", "Northern Cardinal", "Northern Flicker", "Northern Harrier", "Northern Mockingbird", "Northern Parula", "Northern Pintail", "Northern Shoveler", "Northern Waterthrush", "Northern Rough-winged Swallow", "Nuttall's Woodpecker", "Olive-sided Flycatcher", "Orange-crowned Warbler", "Western Osprey", "Ovenbird", "Palm Warbler", "Pacific-slope Flycatcher", "Pectoral Sandpiper", "Peregrine Falcon", "Phainopepla", "Pied-billed Grebe", "Pileated Woodpecker", "Pine Grosbeak", "Pinyon Jay", "Pine Siskin", "Pine Warbler", "Plumbeous Vireo", "Prairie Warbler", "Purple Finch", "Pygmy Nuthatch", "Red-breasted Merganser", "Red-breasted Nuthatch", "Red-breasted Sapsucker", "Red-bellied Woodpecker", "Red Crossbill", "Redhead", "Red-eyed Vireo", "Red-necked Phalarope", "Red-shouldered Hawk", "Red-tailed Hawk", "Red-winged Blackbird", "Ring-billed Gull", "Ring-necked Duck", "Rose-breasted Grosbeak", "Rock Dove", "Rock Wren", "Ruby-throated Hummingbird", "Ruby-crowned Kinglet", "Ruddy Duck", "Ruffed Grouse", "Rufous Hummingbird", "Rusty Blackbird", "Sagebrush Sparrow", "Sage Thrasher", "Savannah Sparrow", "Say's Phoebe", "Scarlet Tanager", "Scott's Oriole", "Semipalmated Plover", "Semipalmated Sandpiper", "Short-eared Owl", "Sharp-shinned Hawk", "Snow Bunting", "Snow Goose", "Solitary Sandpiper", "Song Sparrow", "Sora", "Spotted Sandpiper", "Spotted Towhee", "Steller's Jay", "Swainson's Hawk", "Swamp Sparrow", "Swainson's Thrush", "Tree Swallow", "Trumpeter Swan", "Tufted 
Titmouse", "Tundra Swan", "Veery", "Vesper Sparrow", "Violet-green Swallow", "Warbling Vireo", "Western Bluebird", "Western Grebe", "Western Kingbird", "Western Meadowlark", "Western Sandpiper", "Western Tanager", "Western Wood Pewee", "White-breasted Nuthatch", "White-crowned Sparrow", "White-faced Ibis", "White-throated Sparrow", "White-throated Swift", "Willow Flycatcher", "Wilson's Snipe", "Wild Turkey", "Winter Wren", "Wilson's Warbler", "Wood Duck", "Woodhouse's Scrub Jay", "Wood Thrush", "American Coot", "Yellow-bellied Flycatcher", "Yellow-bellied Sapsucker", "Yellow-headed Blackbird", "Mangrove Warbler", "Myrtle Warbler", "Yellow-throated Vireo"]
-#
-# countries = ["France", "Spain", "Belgium", "Luxembourg", "Italy", "Norway", "Sweden",
-#             "Germany", "Netherlands", "Denmark", "Croatia", "Hungary", "Greece", "Poland", "United Kingdom", "Romania", "Estonia", "Finland", "Portugal", "Ukraine"]
-
-country = ["France"]
-
-
-df = pd.DataFrame()
-for country in country:
-    i = 0
-    print(country)
-    response = requests.get(
-        f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}')
-    js = response.json()
-    ids, files, extensions, ens, lengths, gens, cnts = [], [], [], [], [], [], []
-    for n_page in tqdm(range(1, js["numPages"]+1)):
-        response = requests.get(
-            f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}&page={n_page}')
-        page_js = response.json()
-        for recording in page_js["recordings"]:
-            if not pd.isnull(recording["file-name"]):
-                ids.append(recording["id"])
-                files.append(recording["file"])
-                extensions.append(recording["file-name"][-4:].lower())
-                ens.append(recording["en"])
-                lengths.append(recording["length"])
-                gens.append(recording["gen"])
-                cnts.append(recording["cnt"])
-    df_ = pd.DataFrame.from_records(
-        {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths, "cnt": cnts})
-    df = df.append(df_)
-
-df = df[df['en'] != 'Identity unknown'].copy()
-df = df[df['en'] != 'Soundscape'].copy()
-
-counts = df['en'].value_counts()
-print(counts)
-
-chosen = counts[counts >= 150].index
-df = df[df["en"].isin(chosen)]
-
-df.to_csv("creation_datasets/fichiers_csv/birds_france_train.csv", index=False)
diff --git a/creation_datasets/api_download_test_america.py b/creation_datasets/api_download_test_america.py
deleted file mode 100644
index 9430c86..0000000
--- a/creation_datasets/api_download_test_america.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from pandas import read_csv
-from urllib.request import urlretrieve
-from os import listdir, mkdir, makedirs, path as os_path
-from tqdm import tqdm
-from time import sleep
-import requests
-
-# download all the files from the file csv birds.csv
-
-df = read_csv("creation_datasets/fichiers_csv/birds_america.csv")
-
-# Removing recordings that are not classified
-df = df[df['en'] != 'Identity unknown'].copy()
-df = df[df['en'] != 'Soundscape'].copy()
-counts = df['en'].value_counts()
-print(counts)
-
-
-print(len(df["en"].unique()), "different species")
-
-makedirs('creation_datasets/audio_files_test_america', exist_ok='True')
-audio_dir = 'creation_datasets/audio_files_test_america/'
-
-nots = []
-
-for row in tqdm(df.iterrows(), total=df.shape[0]):
-    url = row[1]["file"]
-    print(url)
-
-    f_name = audio_dir + str(row[1]["id"]) + row[1]["extension"]
-    print(f_name)
-
-    try:
-        #urlretrieve('https:'+url, filename=audio_dir + f_name)
-        response = requests.get(url)
-        open(f_name, 'wb').write(response.content)
-
-    except Exception as e:
-        print("\nRetrying:", url)
-        print(e)
-        sleep(1)
-        try:
-            #urlretrieve('https:'+url, filename=audio_dir + f_name)
-            response = requests.get(url)
-            open(f_name, 'wb').write(response.content)
-        except Exception as ee:
-            print("Not downloaded|", f_name)
-            print(ee)
-            nots.append(row[1]["id"])
-            pass
-
-if len(nots) > 0:
-    with open('not_downloaded.txt', 'w') as f:
-        for item in nots:
-            f.write(str(item) + '\n')
-    print(str(nots))
-else:
-    print('All files were successfully downloaded!')
diff --git a/creation_datasets/api_download_test_france.py b/creation_datasets/api_download_test_france.py
deleted file mode 100644
index 2cd8a01..0000000
--- a/creation_datasets/api_download_test_france.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from pandas import read_csv
-from urllib.request import urlretrieve
-from os import listdir, mkdir, makedirs, path as os_path
-from tqdm import tqdm
-from time import sleep
-import requests
-import pandas as pd
-
-# télécharge les fichiers audio du fichier csv birds_france_test.csv
-
-df = read_csv("creation_datasets/fichiers_csv/birds_france_test.csv")
-
-# Removing recordings that are not classified
-df = df[df['en'] != 'Identity unknown'].copy()
-df = df[df['en'] != 'Soundscape'].copy()
-counts = df['en'].value_counts()
-print(counts)
-
-print(len(df["en"].unique()), "different species")
-
-
-makedirs('creation_datasets/audio_files_test_france', exist_ok='True')
-audio_dir = 'creation_datasets/audio_files_test_france/'
-
-nombre_audio = {}
-for specie in df["en"].unique():
-    nombre_audio[specie] = 0
-print(nombre_audio)
-
-
-nots = []
-
-for row in tqdm(df.iterrows(), total=df.shape[0]):
-    url = row[1]["file"]
-    print(url)
-
-    f_name = audio_dir + str(row[1]["id"]) + \
-        row[1]["extension"]
-
-    try:
-        response = requests.get(url)
-        open(f_name, 'wb').write(response.content)
-    except Exception as e:
-        print("\nRetrying:", url)
-        print(e)
-        sleep(1)
-        try:
-            response = requests.get(url)
-            open(f_name, 'wb').write(response.content)
-        except Exception as ee:
-            print("Not downloaded|", f_name)
-            print(ee)
-            nots.append(row[1]["id"])
-            pass
-
-if len(nots) > 0:
-    with open('not_downloaded.txt', 'w') as f:
-        for item in nots:
-            f.write(str(item) + '\n')
-    print(str(nots))
-else:
-    print('All files were successfully downloaded!')
diff --git a/creation_datasets/create_csv.py b/creation_datasets/create_csv.py
new file mode 100644
index 0000000..6f664c6
--- /dev/null
+++ b/creation_datasets/create_csv.py
@@ -0,0 +1,50 @@
+import pandas as pd
+import requests
+from tqdm import tqdm
+import argparse
+import os
+
+# Writes creation_datasets/fichiers_csv/<country>.csv with the information needed to download the xeno-canto
+# recordings of a country, keeping only the species with at least 150 recordings (threshold can be modified below).
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "country", help="The country for which you want to create the database. You may chose among this list : ['Arab Emirates', 'Algeria', 'Andorra', 'Angola', 'Antarctica', 'Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Belize', 'Bhutan', 'Bolivia', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Congo (Brazzaville)', 'Congo (Democratic Republic)', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Estonia', 'Ethiopia', 'Finland', 'France', 'French Guiana', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kyrgyzstan', 'Laos', 'Latvia', 'Liberia', 'Lithuania', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Malta', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Nigeria', 'Norway', 'Oman', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Romania', 'Russian Federation', 'Rwanda', 'Sao Tome', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Suriname', 'Sweden', 'Switzerland', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Tunisia', 'Turkey', 'Uganda', 'Ukraine', 'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Zambia', 'Zimbabwe']")
+args = parser.parse_args()
+
+country = args.country.replace(" ", "_")
+
+# first request: its answer is only used to learn how many result pages must be fetched
+response = requests.get(
+    f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}')
+response.raise_for_status()
+js = response.json()
+ids, files, extensions, ens, lengths, gens, cnts = [], [], [], [], [], [], []
+for n_page in tqdm(range(1, js["numPages"]+1)):
+    response = requests.get(
+        f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}&page={n_page}')
+    response.raise_for_status()  # stop on an HTTP error instead of trying to parse an error page
+    page_js = response.json()
+    for recording in page_js["recordings"]:
+        if not pd.isnull(recording["file-name"]):
+            ids.append(recording["id"])
+            files.append(recording["file"])
+            extensions.append(recording["file-name"][-4:].lower())
+            ens.append(recording["en"])
+            lengths.append(recording["length"])
+            gens.append(recording["gen"])
+            cnts.append(recording["cnt"])
+# DataFrame.append was removed in pandas 2.0: build the frame directly from the collected columns
+df = pd.DataFrame.from_records(
+    {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths, "cnt": cnts})
+
+# removing recordings that are not classified
+df = df[~df['en'].isin(['Identity unknown', 'Soundscape'])].copy()
+
+# change the value if you wish to keep more or less species
+counts = df['en'].value_counts()
+chosen = counts[counts >= 150].index
+df = df[df["en"].isin(chosen)]
+
+os.makedirs("creation_datasets/fichiers_csv", exist_ok=True)  # the output folder may not exist yet
+df.to_csv("creation_datasets/fichiers_csv/"+country+".csv", index=False)
diff --git a/creation_datasets/api_download_train.py b/creation_datasets/download_files.py
similarity index 60%
rename from creation_datasets/api_download_train.py
rename to creation_datasets/download_files.py
index 01cc6db..f2718d9 100644
--- a/creation_datasets/api_download_train.py
+++ b/creation_datasets/download_files.py
@@ -4,44 +4,44 @@ from os import listdir, mkdir, makedirs, path as os_path
 from tqdm import tqdm
 from time import sleep
 import requests
+import argparse
 
-# télécharge les 50 premiers fichiers audio pour chaque espèce du fichier csv birds_france_train.csv qui compte plus de 150 individus
+parser = argparse.ArgumentParser()
+parser.add_argument("csv_name",
+                    help="Write the name of the csv (with the extension) from which you wish to download the files")
+args = parser.parse_args()
 
-df = read_csv("creation_datasets/fichiers_csv/birds_france_train.csv")
+csv_name = args.csv_name
 
-# Removing recordings that are not classified
+# download the first 50 files (can be modified) for each species of the csv
+# during the train phase we used 50 files, and during the test phase we used 5 files
 
+df = read_csv("creation_datasets/fichiers_csv/"+csv_name)
 
-print(df.shape)
 print(len(df["en"].unique()), "different species")
-print(df)
+
+makedirs('creation_datasets/audio_files_'+csv_name[:-4], exist_ok='True')
+audio_dir = 'creation_datasets/audio_files_'+csv_name[:-4]+'/'
+
+nots = []
 
 nombre_audio = {}
 for specie in df["en"].unique():
     nombre_audio[specie] = 0
-print(nombre_audio)
-
-makedirs('creation_datasets/audio_files_france_train', exist_ok='True')
-audio_dir = 'creation_datasets/audio_files_france_train/'
-
-nots = []
 
 for row in tqdm(df.iterrows(), total=df.shape[0]):
-    url = row[1]["file"]
-    print(url)
 
+    url = row[1]["file"]
     specie = row[1]["en"]
 
+    # change here to modify the number of files to download for each species
     if nombre_audio[specie] < 50:
 
-        f_name = audio_dir+str(row[1]["id"]) + \
-            row[1]["extension"]
-        print(f_name)
+        f_name = audio_dir + str(row[1]["id"]) + row[1]["extension"]
+        print("Downloading", f_name)
         nombre_audio[specie] += 1
-        print(nombre_audio[specie])
 
         try:
-            #urlretrieve('https:'+url, filename=audio_dir + f_name)
             response = requests.get(url)
             open(f_name, 'wb').write(response.content)
 
@@ -50,12 +50,10 @@ for row in tqdm(df.iterrows(), total=df.shape[0]):
             print(e)
             sleep(1)
             try:
-                #urlretrieve('https:'+url, filename=audio_dir + f_name)
                 response = requests.get(url)
                 open(f_name, 'wb').write(response.content)
             except Exception as ee:
                 print("Not downloaded|", f_name)
-                print(ee)
                 nots.append(row[1]["id"])
                 pass
 
-- 
GitLab