mangadex-home-rs/src/cache/fs.rs

230 lines
7.4 KiB
Rust
Raw Normal View History

2021-04-18 16:14:36 -07:00
use actix_web::error::PayloadError;
2021-04-19 19:01:32 -07:00
use bytes::Buf;
2021-04-18 20:06:18 -07:00
use futures::{Stream, StreamExt};
2021-04-19 19:01:32 -07:00
use log::{debug, error};
2021-04-18 14:06:40 -07:00
use once_cell::sync::Lazy;
2021-04-18 20:06:18 -07:00
use std::collections::HashMap;
use std::fmt::Display;
2021-04-09 19:00:59 -07:00
use std::path::{Path, PathBuf};
use std::pin::Pin;
2021-04-09 18:59:29 -07:00
use std::task::{Context, Poll};
2021-04-18 20:06:18 -07:00
use tokio::fs::{create_dir_all, remove_file, File};
2021-04-09 18:59:29 -07:00
use tokio::io::{AsyncRead, AsyncWriteExt, ReadBuf};
2021-04-19 19:01:32 -07:00
use tokio::sync::mpsc::UnboundedSender;
use tokio::sync::watch::{channel, Receiver};
2021-04-17 20:19:27 -07:00
use tokio::sync::RwLock;
2021-04-19 19:01:32 -07:00
use tokio_stream::wrappers::WatchStream;
2021-04-18 20:06:18 -07:00
use tokio_util::codec::{BytesCodec, FramedRead};
2021-04-09 18:59:29 -07:00
2021-04-18 20:06:18 -07:00
use super::{BoxedImageStream, CacheStream, CacheStreamItem};
2021-04-18 14:06:40 -07:00
2021-04-09 20:15:44 -07:00
/// Keeps track of files that are currently being written to.
///
/// Why is this necessary? Consider the following situation:
///
/// Client A requests file `foo.png`. We construct a transparent file stream,
/// and now the file is being streamed into and from.
///
/// Client B requests the same file `foo.png`. A naive implementation would
/// attempt to either read directly the file as it sees the file existing. This
/// is problematic as the file could still be written to. If Client B catches
/// up to Client A's request, then Client B could receive a broken image, as it
/// thinks it's done reading the file.
///
2021-04-14 19:52:54 -07:00
/// We effectively use `WRITING_STATUS` as a status relay to ensure concurrent
2021-04-09 20:15:44 -07:00
/// reads to the file while it's being written to will wait for writing to be
/// completed.
2021-04-19 19:01:32 -07:00
static WRITING_STATUS: Lazy<RwLock<HashMap<PathBuf, Receiver<WritingStatus>>>> =
2021-04-09 18:59:29 -07:00
Lazy::new(|| RwLock::new(HashMap::new()));
2021-04-09 20:15:44 -07:00
/// Tries to read from the file, returning a byte stream if it exists
2021-04-18 20:06:18 -07:00
pub async fn read_file(path: &Path) -> Option<Result<CacheStream, std::io::Error>> {
2021-04-09 20:15:44 -07:00
if path.exists() {
2021-04-19 19:01:32 -07:00
let status = WRITING_STATUS.read().await.get(path).map(Clone::clone);
2021-04-18 20:06:18 -07:00
if let Some(status) = status {
Some(
2021-04-19 19:01:32 -07:00
ConcurrentFsStream::new(path, WatchStream::new(status))
2021-04-18 20:06:18 -07:00
.await
.map(CacheStream::Concurrent),
)
} else {
Some(
File::open(path)
.await
.map(|f| CacheStream::Completed(FramedRead::new(f, BytesCodec::new()))),
)
}
2021-04-09 20:15:44 -07:00
} else {
None
}
}
/// Maps the input byte stream into one that writes to disk instead, returning
/// a stream that reads from disk instead.
2021-04-17 20:19:27 -07:00
pub async fn write_file(
2021-04-09 18:59:29 -07:00
path: &Path,
2021-04-18 14:06:40 -07:00
mut byte_stream: BoxedImageStream,
2021-04-19 19:01:32 -07:00
notifier: UnboundedSender<u64>,
2021-04-18 20:06:18 -07:00
) -> Result<CacheStream, std::io::Error> {
2021-04-19 19:01:32 -07:00
let (tx, rx) = channel(WritingStatus::NotDone);
2021-04-09 20:15:44 -07:00
let mut file = {
2021-04-17 20:19:27 -07:00
let mut write_lock = WRITING_STATUS.write().await;
2021-04-18 20:06:18 -07:00
let parent = path.parent().unwrap();
create_dir_all(parent).await?;
2021-04-19 19:01:32 -07:00
let file = File::create(path).await?; // we need to make sure the file exists and is truncated.
write_lock.insert(path.to_path_buf(), rx.clone());
2021-04-09 20:15:44 -07:00
file
};
// need owned variant because async lifetime
let path_buf = path.to_path_buf();
tokio::spawn(async move {
let path_buf = path_buf; // moves path buf into async
2021-04-14 19:52:54 -07:00
let mut errored = false;
2021-04-19 19:01:32 -07:00
let mut bytes_written: u64 = 0;
2021-04-09 20:15:44 -07:00
while let Some(bytes) = byte_stream.next().await {
2021-04-19 19:01:32 -07:00
if let Ok(mut bytes) = bytes {
loop {
match file.write(&bytes).await? {
0 => break,
n => {
bytes.advance(n);
// We don't care if we don't have receivers
bytes_written += n as u64;
let _ = tx.send(WritingStatus::NotDone);
}
}
}
2021-04-14 19:52:54 -07:00
} else {
errored = true;
break;
2021-04-09 18:59:29 -07:00
}
2021-04-09 20:15:44 -07:00
}
2021-04-09 18:59:29 -07:00
2021-04-14 19:52:54 -07:00
if errored {
2021-04-09 20:15:44 -07:00
// It's ok if the deleting the file fails, since we truncate on
2021-04-14 19:11:00 -07:00
// create anyways, but it should be best effort
2021-04-09 20:15:44 -07:00
let _ = remove_file(&path_buf).await;
} else {
file.flush().await?;
file.sync_all().await?; // we need metadata
2021-04-18 20:06:18 -07:00
debug!("writing to file done");
2021-04-09 20:15:44 -07:00
}
2021-04-09 18:59:29 -07:00
2021-04-17 20:19:27 -07:00
let mut write_lock = WRITING_STATUS.write().await;
2021-04-09 20:15:44 -07:00
// This needs to be written atomically with the write lock, else
// it's possible we have an inconsistent state
2021-04-19 19:01:32 -07:00
//
// We don't really care if we have no receivers
2021-04-14 19:52:54 -07:00
if errored {
2021-04-19 19:01:32 -07:00
let _ = tx.send(WritingStatus::Error);
2021-04-09 20:15:44 -07:00
} else {
2021-04-19 19:01:32 -07:00
let _ = tx.send(WritingStatus::Done);
2021-04-09 20:15:44 -07:00
}
write_lock.remove(&path_buf);
2021-04-09 18:59:29 -07:00
2021-04-19 19:01:32 -07:00
// notify
if let Err(e) = notifier.send(bytes_written) {
error!(
"Failed to notify cache of new entry size: {}. Cache no longer can prune FS!",
e
);
}
2021-04-09 20:15:44 -07:00
// We don't ever check this, so the return value doesn't matter
Ok::<_, std::io::Error>(())
});
2021-04-09 18:59:29 -07:00
2021-04-18 20:06:18 -07:00
Ok(CacheStream::Concurrent(
2021-04-19 19:01:32 -07:00
ConcurrentFsStream::new(path, WatchStream::new(rx)).await?,
2021-04-18 20:06:18 -07:00
))
2021-04-09 18:59:29 -07:00
}
2021-04-18 20:06:18 -07:00
pub struct ConcurrentFsStream {
2021-04-09 18:59:29 -07:00
file: Pin<Box<File>>,
2021-04-19 19:01:32 -07:00
receiver: Pin<Box<WatchStream<WritingStatus>>>,
2021-04-09 18:59:29 -07:00
}
2021-04-18 20:06:18 -07:00
impl ConcurrentFsStream {
2021-04-19 19:01:32 -07:00
async fn new(
path: &Path,
receiver: WatchStream<WritingStatus>,
) -> Result<Self, std::io::Error> {
2021-04-09 18:59:29 -07:00
Ok(Self {
file: Box::pin(File::open(path).await?),
2021-04-19 19:01:32 -07:00
receiver: Box::pin(receiver),
2021-04-09 18:59:29 -07:00
})
}
}
/// Represents some upstream error.
///
/// The error carries no payload; its `Display` output is a fixed message.
#[derive(Debug)]
pub struct UpstreamError;

impl std::error::Error for UpstreamError {}

impl Display for UpstreamError {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("An upstream error occurred")
    }
}
2021-04-18 20:06:18 -07:00
impl Stream for ConcurrentFsStream {
2021-04-18 14:06:40 -07:00
type Item = CacheStreamItem;
2021-04-09 18:59:29 -07:00
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
2021-04-19 19:01:32 -07:00
match self.receiver.as_mut().poll_next_unpin(cx) {
Poll::Ready(status) => {
let mut bytes = [0; 1460].to_vec();
let mut buffer = ReadBuf::new(&mut bytes);
let polled_result = self.file.as_mut().poll_read(cx, &mut buffer);
let filled = buffer.filled().len();
match (status, filled) {
(Some(WritingStatus::NotDone), 0) => Poll::Pending,
// We got an error, abort the read.
(Some(WritingStatus::Error), _) => Poll::Ready(Some(Err(UpstreamError))),
_ => {
bytes.truncate(filled);
polled_result.map(|_| {
if bytes.is_empty() {
None
} else {
Some(Ok(bytes.into()))
}
})
2021-04-18 20:06:18 -07:00
}
2021-04-19 19:01:32 -07:00
}
2021-04-18 20:06:18 -07:00
}
2021-04-19 19:01:32 -07:00
Poll::Pending => Poll::Pending,
2021-04-09 18:59:29 -07:00
}
}
}
2021-04-18 14:06:40 -07:00
impl From<UpstreamError> for actix_web::Error {
2021-04-18 14:11:30 -07:00
#[inline]
2021-04-18 14:06:40 -07:00
fn from(_: UpstreamError) -> Self {
2021-04-18 16:14:36 -07:00
PayloadError::Incomplete(None).into()
2021-04-18 14:06:40 -07:00
}
}
2021-04-19 19:01:32 -07:00
/// Progress of a cache file write, as published to concurrent readers.
#[derive(Debug, Clone, Copy)]
enum WritingStatus {
    /// The file is still being written to.
    NotDone = 0,
    /// The file was written completely.
    Done,
    /// Writing failed; the file contents must not be trusted.
    Error,
}
/// Decodes a [`WritingStatus`] from its numeric discriminant.
impl From<u8> for WritingStatus {
    /// Maps `0`, `1` and `2` to `NotDone`, `Done` and `Error` respectively.
    ///
    /// # Panics
    ///
    /// Panics if `v` is any other value.
    #[inline]
    fn from(v: u8) -> Self {
        // Indexed by discriminant.
        let by_discriminant = [Self::NotDone, Self::Done, Self::Error];
        match by_discriminant.get(usize::from(v)) {
            Some(&status) => status,
            None => unreachable!(),
        }
    }
}